# Ensure relative data paths in scaffold resolve from the workspace root.
import os
os.chdir('/Users/weizexu/Projects/U-Chrom')
print('cwd:', os.getcwd())

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

IDEA = DiscoveryIdea.from_dict({'idea_title': 'Radial enrichment of active H3K27ac chromatin', 'biological_hypothesis': 'H3K27ac-enriched active chromatin occupies a distinct radial nuclear compartment compared with H3K27ac-low chromatin.', 'computable_parameter': 'Robust rank-regression coefficient of tracks.n_rad_score as a function of tracks.H3K27ac across finite spots.', 'analysis_plan': 'Use spots with finite tracks.H3K27ac and tracks.n_rad_score. Within each cell, rank-normalize H3K27ac and n_rad_score to reduce intensity-scale effects, then estimate a pooled robust regression coefficient for n_rad_score versus H3K27ac with cell-wise centering. Assess significance by permuting H3K27ac values within each cell_id and recomputing the coefficient.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nActive chromatin marks may occupy preferential radial nuclear positions relative to inactive or lamina-associated domains.\n\n### Data used\nUse per-spot `H3K27ac`, radial score, spot coordinates, and cell type labels for all available cells.\n\n### Analysis sketch\nFit a rank-based association between `H3K27ac` intensity and `n_rad_score` across spots, optionally residualizing by cell or computing cell-wise slopes before combining them.\n\n### Expected result\nA reproducible nonzero association would indicate that H3K27ac-rich loci are radially polarized rather than randomly distributed.\n\n### Validation checks\nConfirm required fields, sufficient spots per cell type, finite regression coefficient, p-value from a cell-label or within-cell permutation test, runtime limit, deterministic rerun, and a shuffled-H3K27ac negative control.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['coords', 'spots.cell_id', 'tracks.H3K27ac', 'tracks.n_rad_score', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count_at_least_9_total_and_at_least_3_per_cell_type', 'minimum_spot_or_trace_count_at_least_1000_finite_spots', 'finite_numeric_output', 'statistical_hypothesis_test_permutation_p_value', 'runtime_under_budget_5_minutes', 'deterministic_rerun_fixed_seed', 'negative_control_or_permutation_shuffle_H3K27ac_within_cell'], 'expected_direction': 'Nonzero coefficient; if active chromatin is interior-enriched, the sign should follow the schema’s radial-score orientation after checking whether higher n_rad_score denotes interior or periphery.', 'complexity': 2, 'idea_id': 'radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b', 'metadata': {}})
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
cdata = ChromData.read(H5CD_PATH) if H5CD_PATH else None
schema = cdata.discovery_schema if cdata is not None else None
adata = cdata.linked_adata if cdata is not None else None
print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

review = review_idea_against_schema(IDEA, schema) if schema is not None else None
print(None if review is None else review.to_dict())
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection: field availability, finite coverage, and per-cell counts.
import numpy as np
import pandas as pd

tracks_df = cdata.tracks if isinstance(cdata.tracks, pd.DataFrame) else pd.DataFrame(cdata.tracks)
spots_df = cdata.spots.copy() if isinstance(cdata.spots, pd.DataFrame) else pd.DataFrame(cdata.spots)
cells_df = cdata.cells.copy() if isinstance(cdata.cells, pd.DataFrame) else pd.DataFrame(cdata.cells)

needed_spot_cols = ['cell_id']
needed_track_cols = ['H3K27ac', 'n_rad_score']
print('spot columns include:', {col: col in spots_df.columns for col in needed_spot_cols})
print('track columns include:', {col: col in tracks_df.columns for col in needed_track_cols})
print('cells columns include:', {'cell_type': 'cell_type' in cells_df.columns})

preview_cols = ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
preview_cols = [col for col in preview_cols if col in spots_df.columns]
preview = pd.concat([
    spots_df[preview_cols].head(5).reset_index(drop=True),
    tracks_df[needed_track_cols].head(5).reset_index(drop=True),
], axis=1)
print('Preview selected columns:')
display(preview)

finite_mask = (
    pd.to_numeric(tracks_df['H3K27ac'], errors='coerce').replace([np.inf, -np.inf], np.nan).notna()
    & pd.to_numeric(tracks_df['n_rad_score'], errors='coerce').replace([np.inf, -np.inf], np.nan).notna()
    & spots_df['cell_id'].notna()
)
cell_counts = spots_df.loc[finite_mask].groupby('cell_id').size().rename('finite_spots')
cell_counts = cell_counts.to_frame().join(cells_df[['cell_type']], how='left')
print('Finite spot coverage:', int(finite_mask.sum()), 'of', len(spots_df), 'spots')
display(cell_counts.groupby('cell_type')['finite_spots'].agg(['count','sum','min','median','max']))
display(cell_counts.head(12))

spot columns include: {'cell_id': True}
track columns include: {'H3K27ac': True, 'n_rad_score': True}
cells columns include: {'cell_type': True}
Preview selected columns:
   chrom      start        end  ...        name  H3K27ac n_rad_score
0  chr14   30425000   30450000  ...  chr14-1096 -0.57949    0.943566
1   chr2   99725000   99750000  ...   chr2-3864 -0.90438    0.984917
2  chr14   39625000   39650000  ...  chr14-1464 -0.92340    1.000000
3   chr2   99950000   99975000  ...   chr2-3873 -0.94083    1.000000
4  chr14  108450000  108475000  ...  chr14-4210 -0.74748    0.990121

[5 rows x 8 columns]
Finite spot coverage: 56036 of 56036 spots
           count    sum   min  median    max
cell_type                                   
Bergmann       3  22829  3932  7614.0  11283
Granule        3  12085  3220  4183.0   4682
Purkinje       3  21122  4225  5238.0  11659
         finite_spots cell_type
cell_id                        
1_0_116         11659  Purkinje
1_0_26           4225  Purkinje
1_0_34           3932  Bergmann
1_0_37           5238  Purkinje
1_0_42           4183   Granule
1_0_47           4682   Granule
1_0_61          11283  Bergmann
1_0_63           7614  Bergmann
1_0_69           3220   Granule

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b.ipynb:29: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  "id": "workspace_context",

# Main exploration: within-cell rank association and bounded permutation test.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import Image as IPyImage, display
import numpy as np
import pandas as pd
import json
from pathlib import Path

rng = np.random.default_rng(20250318)
N_PERMUTATIONS = 500
MIN_CELLS = 9
MIN_FINITE_SPOTS = 1000

# Assemble the finite per-spot analysis table.
tracks_df = cdata.tracks if isinstance(cdata.tracks, pd.DataFrame) else pd.DataFrame(cdata.tracks)
spots_df = cdata.spots.copy() if isinstance(cdata.spots, pd.DataFrame) else pd.DataFrame(cdata.spots)
cells_df = cdata.cells.copy() if isinstance(cdata.cells, pd.DataFrame) else pd.DataFrame(cdata.cells)

analysis_df = pd.DataFrame({
    'cell_id': spots_df['cell_id'].astype(str).to_numpy(),
    'H3K27ac': pd.to_numeric(tracks_df['H3K27ac'], errors='coerce').to_numpy(),
    'n_rad_score': pd.to_numeric(tracks_df['n_rad_score'], errors='coerce').to_numpy(),
})
if 'chrom' in spots_df.columns:
    analysis_df['chrom'] = spots_df['chrom'].astype(str).to_numpy()
cell_type_map = cells_df['cell_type'].astype(str).to_dict() if 'cell_type' in cells_df.columns else {}
analysis_df['cell_type'] = analysis_df['cell_id'].map(cell_type_map)
analysis_df = analysis_df.replace([np.inf, -np.inf], np.nan).dropna(subset=['cell_id', 'H3K27ac', 'n_rad_score', 'cell_type']).copy()

# Rank-normalize within each cell to reduce cell-scale and intensity-scale effects.
analysis_df['h3_rank'] = analysis_df.groupby('cell_id', observed=True)['H3K27ac'].rank(method='average', pct=True)
analysis_df['rad_rank'] = analysis_df.groupby('cell_id', observed=True)['n_rad_score'].rank(method='average', pct=True)
analysis_df['h3_centered_rank'] = analysis_df['h3_rank'] - analysis_df.groupby('cell_id', observed=True)['h3_rank'].transform('mean')
analysis_df['rad_centered_rank'] = analysis_df['rad_rank'] - analysis_df.groupby('cell_id', observed=True)['rad_rank'].transform('mean')

valid_cells = analysis_df.groupby('cell_id', observed=True).size()
selected_cells = valid_cells[valid_cells >= 3].index.tolist()
analysis_df = analysis_df[analysis_df['cell_id'].isin(selected_cells)].copy()

x = analysis_df['h3_centered_rank'].to_numpy(dtype=float)
y = analysis_df['rad_centered_rank'].to_numpy(dtype=float)

def slope_from_arrays(x_arr, y_arr):
    denom = float(np.dot(x_arr, x_arr))
    if denom <= 0 or not np.isfinite(denom):
        return np.nan
    return float(np.dot(x_arr, y_arr) / denom)

def corr_from_arrays(x_arr, y_arr):
    denom = float(np.sqrt(np.dot(x_arr, x_arr) * np.dot(y_arr, y_arr)))
    if denom <= 0 or not np.isfinite(denom):
        return np.nan
    return float(np.dot(x_arr, y_arr) / denom)

observed_slope = slope_from_arrays(x, y)
observed_corr = corr_from_arrays(x, y)

# Cell-level slopes for interpretability; the pooled test remains within-cell permuted.
cell_rows = []
for cell_id, grp in analysis_df.groupby('cell_id', observed=True):
    gx = grp['h3_centered_rank'].to_numpy(dtype=float)
    gy = grp['rad_centered_rank'].to_numpy(dtype=float)
    cell_rows.append({
        'cell_id': cell_id,
        'cell_type': str(grp['cell_type'].iloc[0]),
        'n_finite_spots': int(len(grp)),
        'cell_rank_slope': slope_from_arrays(gx, gy),
        'cell_rank_correlation': corr_from_arrays(gx, gy),
        'median_H3K27ac': float(np.nanmedian(grp['H3K27ac'])),
        'median_n_rad_score': float(np.nanmedian(grp['n_rad_score'])),
    })
cell_summary = pd.DataFrame(cell_rows).sort_values(['cell_type', 'cell_id']).reset_index(drop=True)

# Permute H3K27ac centered ranks within each cell. This keeps radial scores, cell sizes, and cell identities fixed.
cell_indices = [idx.to_numpy() for _, idx in analysis_df.groupby('cell_id', observed=True).groups.items()]
x_base = x.copy()
null_slopes = np.empty(N_PERMUTATIONS, dtype=float)
for b in range(N_PERMUTATIONS):
    x_perm = x_base.copy()
    for idx in cell_indices:
        x_perm[idx] = rng.permutation(x_perm[idx])
    null_slopes[b] = slope_from_arrays(x_perm, y)

if len(analysis_df) >= MIN_FINITE_SPOTS and len(selected_cells) >= MIN_CELLS and np.isfinite(observed_slope):
    p_value = float((1 + np.sum(np.abs(null_slopes) >= abs(observed_slope))) / (N_PERMUTATIONS + 1))
    hypothesis_test_status = 'pass'
    insuff_note = ''
else:
    p_value = float('nan')
    hypothesis_test_status = 'insufficient_data'
    insuff_note = f'Need >= {MIN_FINITE_SPOTS} finite spots and >= {MIN_CELLS} cells; found {len(analysis_df)} spots and {len(selected_cells)} cells.'

null_mean = float(np.nanmean(null_slopes))
null_sd = float(np.nanstd(null_slopes, ddof=1))
permutation_z = float((observed_slope - null_mean) / null_sd) if null_sd > 0 else float('nan')

null_hypothesis = 'Within each cell, H3K27ac rank is exchangeable across finite spots with respect to n_rad_score rank, giving no within-cell radial association.'
alternative_hypothesis = 'H3K27ac rank is non-randomly associated with n_rad_score rank within cells, producing a nonzero pooled centered rank-regression coefficient.'
test_method = f'Within-cell H3K27ac-rank permutation test ({N_PERMUTATIONS} permutations, two-sided)'

result_table = pd.DataFrame([{
    'idea_id': IDEA.idea_id,
    'n_rows': int(len(analysis_df)),
    'n_selected_cells': int(len(selected_cells)),
    'n_cell_types': int(analysis_df['cell_type'].nunique()),
    'observed_statistic': float(observed_slope),
    'effect_size': float(observed_slope),
    'rank_correlation_effect_size': float(observed_corr),
    'null_mean': null_mean,
    'null_sd': null_sd,
    'permutation_z': permutation_z,
    'p_value': p_value,
    'test_method': test_method,
    'hypothesis_test_status': hypothesis_test_status,
}])

result_path = RUN_OUTPUT_DIR / 'radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b_result.csv'
figure_path = RUN_OUTPUT_DIR / 'radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b_statistical_summary.png'
result_table.to_csv(result_path, index=False)

analysis_summary = {
    'idea_id': IDEA.idea_id,
    'n_rows': int(len(analysis_df)),
    'n_selected_cells': int(len(selected_cells)),
    'n_cell_types': int(analysis_df['cell_type'].nunique()),
    'cell_types': sorted(map(str, analysis_df['cell_type'].dropna().unique().tolist())),
    'parameter_name': 'pooled_within_cell_centered_rank_slope_n_rad_score_vs_H3K27ac',
    'parameter_value': float(observed_slope),
    'observed_statistic': float(observed_slope),
    'effect_size': float(observed_slope),
    'rank_correlation_effect_size': float(observed_corr),
    'p_value': p_value,
    'test_method': test_method,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
    'hypothesis_test_status': hypothesis_test_status,
    'n_permutations': int(N_PERMUTATIONS),
    'null_distribution_mean': null_mean,
    'null_distribution_sd': null_sd,
    'permutation_z': permutation_z,
    'expected_direction_note': 'Schema context does not specify whether higher n_rad_score denotes interior or periphery; sign indicates radial polarization direction only.',
    'result_path': str(result_path),
    'figure_path': str(figure_path),
    'notes': [insuff_note] if insuff_note else [],
}

# Scientific statistical figure: observed statistic compared with null distribution and cell-level summaries.
plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.edgecolor': 'black',
    'axes.labelsize': 10,
    'axes.titlesize': 11,
    'legend.fontsize': 9,
})
fig, axes = plt.subplots(1, 2, figsize=(11, 4.2), constrained_layout=True)
ax = axes[0]
ax.hist(null_slopes, bins=34, color='#b8c7d9', edgecolor='#3d4b5c', alpha=0.95, label='Within-cell shuffled null')
ax.axvline(observed_slope, color='#b22222', lw=2.2, label='Observed pooled slope')
ax.axvline(0, color='black', lw=1.0, ls='--', label='Zero association')
ax.set_xlabel('Centered rank-regression slope\n(n_rad_score rank vs H3K27ac rank)')
ax.set_ylabel('Permutation count')
ax.set_title('Hypothesis-test evidence')
ax.legend(loc='best', frameon=False)
annotation = (f'p = {p_value:.4g}\n'
              f'effect slope = {observed_slope:.4f}\n'
              f'rank r = {observed_corr:.4f}\n'
              f'n = {len(analysis_df):,} spots, {len(selected_cells)} cells\n'
              f'{N_PERMUTATIONS} permutations')
ax.text(0.03, 0.97, annotation, transform=ax.transAxes, va='top', ha='left',
        bbox=dict(boxstyle='round,pad=0.35', facecolor='white', edgecolor='#777777', alpha=0.95))

ax2 = axes[1]
cell_type_order = sorted(cell_summary['cell_type'].unique())
positions = np.arange(len(cell_type_order))
for i, ct in enumerate(cell_type_order):
    vals = cell_summary.loc[cell_summary['cell_type'] == ct, 'cell_rank_slope'].to_numpy(float)
    jitter = np.linspace(-0.08, 0.08, len(vals)) if len(vals) > 1 else np.array([0.0])
    ax2.scatter(np.full(len(vals), i) + jitter, vals, s=55, alpha=0.9, label=ct)
    ax2.plot([i - 0.18, i + 0.18], [np.nanmedian(vals), np.nanmedian(vals)], color='black', lw=2)
ax2.axhline(0, color='black', lw=1.0, ls='--')
ax2.set_xticks(positions, cell_type_order, rotation=20, ha='right')
ax2.set_ylabel('Cell-level centered rank slope')
ax2.set_xlabel('Cell type (3 cells each)')
ax2.set_title('Cell-level direction check')
ax2.legend(loc='best', frameon=False, title='Cell type')
fig.suptitle('Radial enrichment of active H3K27ac chromatin: rank-based within-cell test', y=1.02, fontsize=12)
fig.savefig(figure_path, dpi=180, bbox_inches='tight')
plt.show()
display(IPyImage(filename=str(figure_path)))

print('result_table:')
display(result_table)
print('cell_summary:')
display(cell_summary)
print('analysis_summary:')
print(json.dumps(analysis_summary, indent=2))

<IPython.core.display.Image object>
result_table:
                                             idea_id  ...  hypothesis_test_status
0  radial-enrichment-of-active-h3k27ac-chromatin-...  ...                    pass

[1 rows x 13 columns]
cell_summary:
   cell_id cell_type  ...  median_H3K27ac  median_n_rad_score
0   1_0_34  Bergmann  ...        0.130760            0.797328
1   1_0_61  Bergmann  ...        0.201850            0.775931
2   1_0_63  Bergmann  ...        0.379870            0.757641
3   1_0_42   Granule  ...       -0.009423            0.801643
4   1_0_47   Granule  ...        0.106500            0.780882
5   1_0_69   Granule  ...        0.063118            0.791818
6  1_0_116  Purkinje  ...        0.265230            0.746364
7   1_0_26  Purkinje  ...        0.462360            0.719498
8   1_0_37  Purkinje  ...        0.477540            0.739870

[9 rows x 7 columns]
analysis_summary:
{
  "idea_id": "radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b",
  "n_rows": 56036,
  "n_selected_cells": 9,
  "n_cell_types": 3,
  "cell_types": [
    "Bergmann",
    "Granule",
    "Purkinje"
  ],
  "parameter_name": "pooled_within_cell_centered_rank_slope_n_rad_score_vs_H3K27ac",
  "parameter_value": -0.5242243049105262,
  "observed_statistic": -0.5242243049105262,
  "effect_size": -0.5242243049105262,
  "rank_correlation_effect_size": -0.5242242983523263,
  "p_value": 0.001996007984031936,
  "test_method": "Within-cell H3K27ac-rank permutation test (500 permutations, two-sided)",
  "null_hypothesis": "Within each cell, H3K27ac rank is exchangeable across finite spots with respect to n_rad_score rank, giving no within-cell radial association.",
  "alternative_hypothesis": "H3K27ac rank is non-randomly associated with n_rad_score rank within cells, producing a nonzero pooled centered rank-regression coefficient.",
  "hypothesis_test_status": "pass",
  "n_permutations": 500,
  "null_distribution_mean": 0.00043183037922183565,
  "null_distribution_sd": 0.004039928747906687,
  "permutation_z": -129.86767045374293,
  "expected_direction_note": "Schema context does not specify whether higher n_rad_score denotes interior or periphery; sign indicates radial polarization direction only.",
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b_result.csv",
  "figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b_statistical_summary.png",
  "notes": []
}

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b.ipynb:193: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  "source": [

checks = {check: 'not_run' for check in IDEA.validation_checks}
notes = []
checks.setdefault('statistical_hypothesis_test', 'not_run')

def _check_keys(prefix):
    return [key for key in checks if key == prefix or key.startswith(prefix + ':')]

def _set_check(prefix, value):
    keys = _check_keys(prefix)
    if not keys:
        checks[prefix] = value
        return
    for key in keys:
        checks[key] = value

def _check_status(prefix):
    values = [checks[key] for key in _check_keys(prefix)]
    if not values:
        return None
    if 'fail' in values:
        return 'fail'
    if all(value == 'pass' for value in values):
        return 'pass'
    return values[0]

_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    aligned = True
    if cdata is not None and adata is not None and len(cdata.cells) == len(adata.obs_names):
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    try:
        p_float = float(p_value)
    except Exception:
        p_float = np.nan
    try:
        stat_float = float(observed_statistic)
    except Exception:
        stat_float = np.nan
    try:
        effect_float = float(effect_size)
    except Exception:
        effect_float = np.nan
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    result_columns_text = ''
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = any(
        token in control_text
        for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    )
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )
for check in list(checks):
    if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
        checks[check] = 'not_implemented'

required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
status = 'pass'
for check in required_for_pass:
    if _check_status(check) == 'fail':
        status = 'fail'
        notes.append(f'{check} failed')
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = len(result_table) if result_table is not None else 0
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

verification = {
    'idea_id': IDEA.idea_id,
    'status': status,
    'checks': checks,
    'parameter_value': analysis_summary.get('parameter_value'),
    'p_value': analysis_summary.get('p_value'),
    'test_method': analysis_summary.get('test_method'),
    'effect_size': analysis_summary.get('effect_size'),
    'result_path': analysis_summary.get('result_path'),
    'notes': notes + analysis_summary.get('notes', []),
}
print(json.dumps(verification, indent=2))

{
  "idea_id": "radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count_at_least_9_total_and_at_least_3_per_cell_type": "not_run",
    "minimum_spot_or_trace_count_at_least_1000_finite_spots": "not_run",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_permutation_p_value": "not_run",
    "runtime_under_budget_5_minutes": "not_run",
    "deterministic_rerun_fixed_seed": "not_run",
    "negative_control_or_permutation_shuffle_H3K27ac_within_cell": "not_implemented",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": -0.5242243049105262,
  "p_value": 0.001996007984031936,
  "test_method": "Within-cell H3K27ac-rank permutation test (500 permutations, two-sided)",
  "effect_size": -0.5242243049105262,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/radial-enrichment-of-active-h3k27ac-chromatin-65ea38bd9b_result.csv",
  "notes": []
}

Auto-discovery idea: Radial enrichment of active H3K27ac chromatin¶

Rationale¶

Data used¶

Analysis sketch¶

Expected result¶

Validation checks¶

Graphical abstract¶

Required data checks¶

Exploration¶

Critique and compact analysis plan¶

Statistical figure¶

Runner verification summary¶

Final interpretation¶

Final interpretation¶