from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Idea specification evaluated by this notebook, built from a literal dict via
# DiscoveryIdea.from_dict. Later cells read IDEA.idea_id (printed and reported in the
# summaries) and IDEA.validation_checks (used to seed the verification check table).
IDEA = DiscoveryIdea.from_dict({'idea_title': 'Purkinje-specific H3K27ac decompaction along traced chromosomes', 'biological_hypothesis': 'Purkinje cells show stronger local 3D decompaction at H3K27ac-high chromatin bins than Granule or Bergmann cells.', 'computable_parameter': 'Purkinje_H3K27ac_decompaction_interaction = median_trace_delta_Purkinje - median_trace_delta_nonPurkinje, where each trace_delta is median adjacent 3D distance for top-quartile H3K27ac spots minus bottom-quartile H3K27ac spots within the same trace and chromosome.', 'analysis_plan': 'Filter spots to traces with ordered bins on the same chromosome, compute Euclidean adjacent-bin distances from coords after sorting by spots.start, label H3K27ac top and bottom quartiles within comparable trace/chromosome strata, compute trace-level deltas, then test the Purkinje versus non-Purkinje interaction by permutation of cell_type labels or a rank-sum test on trace-level deltas with a reported p-value.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nPurkinje cells may place active H3K27ac-rich chromatin into a more open 3D configuration than Granule or Bergmann cells.\n\n### Data used\nUse traced 3D coordinates ordered by chromosome and genomic start, stratified by cell type, with spot-level H3K27ac intensity.\n\n### Analysis sketch\nWithin each trace and chromosome, compare adjacent-bin 3D distances for top-quartile versus bottom-quartile H3K27ac spots, then aggregate as a Purkinje-versus-non-Purkinje interaction effect.\n\n### Expected result\nThe active-chromatin decompaction effect is expected to be more positive in Purkinje cells, indicating larger adjacent-bin distances at H3K27ac-high loci.\n\n### Validation checks\nRequire enough cells and traces per group, finite distances, an exact/permutation p-value, deterministic rerun, runtime logging, and a marker-label permutation control.', 'cell_types': ['Purkinje', 'Granule', 'Bergmann'], 
'required_fields': ['coords', 'spots.cell_id', 'spots.trace_id', 'spots.chrom', 'spots.start', 'cells.cell_type', 'tracks.H3K27ac'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test_with_p_value', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'Positive interaction: H3K27ac-high adjacent-bin distances exceed H3K27ac-low distances more strongly in Purkinje cells than in Granule or Bergmann cells.', 'complexity': 4, 'idea_id': 'purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f', 'metadata': {}})
# Dataset and output locations are given relative to the working directory first.
H5CD_PATH = Path('tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd')
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
if not H5CD_PATH.exists():
    # Relative path not found: re-anchor both paths under the fixed project checkout.
    project_root = Path('/Users/weizexu/Projects/U-Chrom')
    H5CD_PATH, RUN_OUTPUT_DIR = project_root / H5CD_PATH, project_root / RUN_OUTPUT_DIR

# Load the ChromData container; the None branches keep later cells guard-friendly.
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
else:
    cdata = None

if cdata is None:
    schema = None
    adata = None
else:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

print(IDEA.idea_id)
if cdata is not None:
    # Show the container repr followed by its agent-oriented description.
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Validate the idea against the dataset schema; skipped (None) when no schema is loaded.
if schema is None:
    review = None
else:
    review = review_idea_against_schema(IDEA, schema)
print(review.to_dict() if review is not None else None)
# Stop the notebook here if the review ran and rejected the idea.
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight schema/data inspection for alignment and finite coverage
import os; os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd

# First rows of the spot table restricted to the columns the analysis relies on.
spots_preview = cdata.spots[['cell_id', 'trace_id', 'chrom', 'start']].head()
# Number of cells per annotated cell type, ordered by type name.
cells_by_type = cdata.cells['cell_type'].value_counts().sort_index()
h3 = np.asarray(cdata.tracks['H3K27ac'], dtype=float)
coords = np.asarray(cdata.coords, dtype=float)
inspection_summary = {
    'n_spots': int(len(cdata.spots)),
    'n_traces': int(cdata.n_traces),
    'n_cells': int(cdata.n_cells),
    'cell_type_counts': cells_by_type.to_dict(),
    'coords_shape': tuple(coords.shape),
    'h3k27ac_shape': tuple(h3.shape),
    'finite_coord_fraction': float(np.isfinite(coords).mean()),
    'finite_h3k27ac_fraction': float(np.isfinite(h3).mean()),
    # observed=True counts only (trace_id, chrom) pairs that actually occur, matching the
    # grouping used by the main analysis loop, and avoids the pandas FutureWarning about
    # the changing default for categorical groupers.
    'trace_chrom_groups': int(
        cdata.spots.groupby(['trace_id', 'chrom'], observed=True).ngroups
    ),
}
print(json.dumps(inspection_summary, indent=2))
display(spots_preview)
display(cells_by_type.to_frame('n_cells'))

{
  "n_spots": 56036,
  "n_traces": 213,
  "n_cells": 9,
  "cell_type_counts": {
    "Bergmann": 3,
    "Granule": 3,
    "Purkinje": 3
  },
  "coords_shape": [
    56036,
    3
  ],
  "h3k27ac_shape": [
    56036
  ],
  "finite_coord_fraction": 1.0,
  "finite_h3k27ac_fraction": 1.0,
  "trace_chrom_groups": 213
}
  cell_id         trace_id  chrom      start
0  1_0_61  1_0_61_chr14_a2  chr14   30425000
1  1_0_61   1_0_61_chr2_a2   chr2   99725000
2  1_0_61  1_0_61_chr14_a1  chr14   39625000
3  1_0_61   1_0_61_chr2_a2   chr2   99950000
4  1_0_61  1_0_61_chr14_a1  chr14  108450000
           n_cells
cell_type         
Bergmann         3
Granule          3
Purkinje         3

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f.ipynb:19: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  "### Expected result\n",

# Main compact analysis: trace/chromosome H3K27ac high-vs-low local decompaction and permutation test
import os; os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib; matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display, Image
import numpy as np
import pandas as pd
import json
import warnings
from pathlib import Path

# Fixed seed so the permutation null and the plot jitter are reproducible.
rng = np.random.default_rng(20250220)
# Relative paths are what the summaries report.
result_rel = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f_result.csv')
fig_rel = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f_statistical_summary.png')
# Write under the RUN_OUTPUT_DIR resolved by the setup cell instead of a hard-coded
# absolute root, so outputs land next to the dataset that was actually loaded.
result_path = RUN_OUTPUT_DIR / result_rel.name
fig_path = RUN_OUTPUT_DIR / fig_rel.name
result_path.parent.mkdir(parents=True, exist_ok=True)

# One row per spot: identifiers, genomic start, H3K27ac intensity and 3D coordinates.
spots = cdata.spots[['cell_id', 'trace_id', 'chrom', 'start']].copy().reset_index(drop=True)
coords = np.asarray(cdata.coords, dtype=float)
h3 = np.asarray(cdata.tracks['H3K27ac'], dtype=float)
spots['H3K27ac'] = h3
spots['x'] = coords[:, 0]
spots['y'] = coords[:, 1]
spots['z'] = coords[:, 2]

# Key the lookup by str(cell id) so it matches the str-cast spot ids below even when
# the cells index is not string-typed.
cell_types = cdata.cells['cell_type'].astype(str)
cell_type_map = dict(zip(cell_types.index.astype(str), cell_types))
spots['cell_type'] = spots['cell_id'].astype(str).map(cell_type_map)

# A spot whose cell has no entry in the cells table cannot be assigned to either
# comparison group; drop it rather than letting it be counted as non-Purkinje.
missing_type = spots['cell_type'].isna()
if missing_type.any():
    warnings.warn(
        f'Dropping {int(missing_type.sum())} spots whose cell_id has no cell_type annotation.'
    )
    spots = spots.loc[~missing_type].reset_index(drop=True)

# Per trace/chromosome: sort by genomic start, compute each spot's local adjacent distance
# as the median of its previous/next Euclidean distances, then compare H3K27ac quartile bins.
def _h3k27ac_delta_row(trace_id, chrom, group):
    """Return the high-minus-low H3K27ac distance record for one trace/chromosome group.

    Returns None when the group has fewer than four spots (before or after dropping rows
    with non-finite coordinates or marker values), when H3K27ac is constant within the
    group, or when either quartile set holds fewer than two spots.
    """
    ordered = group.sort_values('start').reset_index(drop=True)
    if len(ordered) < 4:
        return None
    positions = ordered[['x', 'y', 'z']].to_numpy(float)
    signal = ordered['H3K27ac'].to_numpy(float)
    usable = np.isfinite(positions).all(axis=1) & np.isfinite(signal)
    if usable.sum() < 4:
        return None
    # Keep the ordered finite rows only.
    ordered = ordered.loc[usable].reset_index(drop=True)
    positions = ordered[['x', 'y', 'z']].to_numpy(float)
    signal = ordered['H3K27ac'].to_numpy(float)
    if len(ordered) < 4 or np.nanmax(signal) == np.nanmin(signal):
        return None

    # Distance between each pair of genomically consecutive spots.
    steps = np.sqrt(((positions[1:] - positions[:-1]) ** 2).sum(axis=1))
    # End spots have a single neighbour; interior spots take the median of both edges.
    per_spot = np.full(len(ordered), np.nan)
    per_spot[0] = steps[0]
    per_spot[-1] = steps[-1]
    if len(ordered) > 2:
        per_spot[1:-1] = np.nanmedian(np.vstack([steps[:-1], steps[1:]]), axis=0)

    lower_cut, upper_cut = np.nanquantile(signal, [0.25, 0.75])
    in_low = signal <= lower_cut
    in_high = signal >= upper_cut
    if low_count_too_small(in_low) or low_count_too_small(in_high):
        return None

    median_high = float(np.nanmedian(per_spot[in_high]))
    median_low = float(np.nanmedian(per_spot[in_low]))
    return {
        'trace_id': str(trace_id),
        'chrom': str(chrom),
        'cell_id': str(ordered['cell_id'].iloc[0]),
        'cell_type': str(ordered['cell_type'].iloc[0]),
        'n_spots': int(len(ordered)),
        'n_high_spots': int(in_high.sum()),
        'n_low_spots': int(in_low.sum()),
        'median_adjacent_distance_high_H3K27ac_um': median_high,
        'median_adjacent_distance_low_H3K27ac_um': median_low,
        'trace_delta_um': float(median_high - median_low),
        'median_H3K27ac_high': float(np.nanmedian(signal[in_high])),
        'median_H3K27ac_low': float(np.nanmedian(signal[in_low])),
    }


def low_count_too_small(mask):
    """Return True when a quartile mask selects fewer than two spots."""
    return mask.sum() < 2


rows = []
for (trace_id, chrom), g in spots.groupby(['trace_id', 'chrom'], observed=True, sort=False):
    record = _h3k27ac_delta_row(trace_id, chrom, g)
    if record is not None:
        rows.append(record)

trace_delta_table = pd.DataFrame(rows)
if trace_delta_table.empty:
    raise RuntimeError('No eligible trace/chromosome groups produced H3K27ac high-low deltas.')
# Boolean group label used by the Purkinje versus non-Purkinje comparison.
trace_delta_table['is_purkinje'] = trace_delta_table['cell_type'].eq('Purkinje')

# Split the trace/chromosome deltas into the two comparison groups.
pur = trace_delta_table.loc[trace_delta_table['is_purkinje'], 'trace_delta_um'].to_numpy(float)
non = trace_delta_table.loc[~trace_delta_table['is_purkinje'], 'trace_delta_um'].to_numpy(float)
# Test statistic: difference of group medians (Purkinje minus non-Purkinje).
observed_statistic = float(np.nanmedian(pur) - np.nanmedian(non))
effect_size = observed_statistic
n_perm = 500
labels = trace_delta_table['is_purkinje'].to_numpy(bool)
values = trace_delta_table['trace_delta_um'].to_numpy(float)
# Null distribution: recompute the statistic after shuffling the group labels.
# NOTE(review): labels are shuffled across trace/chromosome rows, so rows from the same
# cell are treated as exchangeable units - confirm this is the intended unit of replication.
null_stats = np.empty(n_perm, dtype=float)
for i in range(n_perm):
    shuffled = rng.permutation(labels)
    null_stats[i] = np.nanmedian(values[shuffled]) - np.nanmedian(values[~shuffled])
if np.isfinite(observed_statistic):
    # One-sided p-value with the +1 correction so it can never be exactly zero.
    p_value = float((1 + np.sum(null_stats >= observed_statistic)) / (n_perm + 1))
else:
    # Comparisons against NaN are all False, which would otherwise report the smallest
    # possible p-value for an undefined statistic (e.g. an empty group).
    p_value = float('nan')

hypothesis_test_status = 'pass' if (len(pur) >= 2 and len(non) >= 2 and np.isfinite(p_value)) else 'insufficient_data'
test_method = f'one-sided label permutation test on trace/chromosome deltas, {n_perm} permutations, statistic=median(Purkinje)-median(non-Purkinje)'

# Result table: trace-level rows with global test annotations and enough columns to reconstruct the statistic.
result_table = trace_delta_table.copy()
result_table['observed_statistic_um'] = observed_statistic
result_table['effect_size_um'] = effect_size
result_table['p_value'] = p_value
result_table['test_method'] = test_method
result_table['hypothesis_test_status'] = hypothesis_test_status
result_table.to_csv(result_path, index=False)

# Statistical figure: null distribution plus group comparison.
plt.rcParams.update({'figure.facecolor': 'white', 'axes.facecolor': 'white', 'font.size': 10})
fig, axes = plt.subplots(1, 2, figsize=(11, 4.2), constrained_layout=True)

# Left panel: permutation null with the observed statistic and the zero reference.
ax = axes[0]
ax.hist(null_stats, bins=30, color='#d0d0d0', edgecolor='white', label='Permuted labels')
ax.axvline(observed_statistic, color='#b2182b', linewidth=2.2, label='Observed statistic')
ax.axvline(0, color='black', linewidth=1, linestyle=':', label='No interaction')
ax.set_xlabel('Median delta(Purkinje) - median delta(non-Purkinje) (µm)')
ax.set_ylabel('Permutation count')
ax.set_title('Permutation null distribution')
ax.legend(frameon=False)
ax.text(0.02, 0.96, f'p={p_value:.4f}\neffect={effect_size:.4f} µm\nn_perm={n_perm}', transform=ax.transAxes, va='top', ha='left')

# Right panel: per-group box plots with the individual deltas overlaid.
ax = axes[1]
group_names = ['non-Purkinje', 'Purkinje']
group_values = [non, pur]
# Tick labels are set separately: the boxplot `labels=` keyword is deprecated in
# Matplotlib 3.9 (renamed `tick_labels`), while set_xticklabels works on all versions.
box = ax.boxplot(group_values, patch_artist=True, showfliers=False)
ax.set_xticklabels(group_names)
for patch, color in zip(box['boxes'], ['#67a9cf', '#ef8a62']):
    patch.set_facecolor(color)
    patch.set_alpha(0.65)
for j, vals in enumerate(group_values, start=1):
    # Small horizontal jitter so overlapping points stay visible.
    jitter = rng.normal(0, 0.035, size=len(vals))
    ax.scatter(np.full(len(vals), j) + jitter, vals, s=18, alpha=0.65, edgecolor='white', linewidth=0.3,
               color=['#2166ac', '#b2182b'][j-1], label=f'{group_names[j-1]} trace/chrom deltas')
ax.axhline(0, color='black', linewidth=1, linestyle=':')
ax.set_ylabel('Trace/chromosome H3K27ac high-low local adjacent distance delta (µm)')
ax.set_title('Trace-level decompaction deltas')
ax.legend(frameon=False, fontsize=8, loc='best')
ax.text(0.02, 0.96, f'n={len(non)} non-Purkinje\nn={len(pur)} Purkinje\nmethod: permutation', transform=ax.transAxes, va='top', ha='left')
fig.suptitle('Purkinje-specific H3K27ac decompaction along traced chromosomes', fontsize=12)
# Save to disk, release the figure, then show the saved image in the notebook.
fig.savefig(fig_path, dpi=180, bbox_inches='tight')
plt.close(fig)
display(Image(filename=str(fig_path)))

# Machine-readable summary of the analysis; the verification cell reads it back by key.
summary_notes = [
    'Spot-level local adjacent distances were computed within each trace/chromosome after genomic sorting.',
    'H3K27ac high and low sets use within-trace/chromosome quartiles to reduce global intensity confounding.',
    'Permutation test shuffles Purkinje labels across trace/chromosome deltas as a compact negative control.'
]
analysis_summary = dict(
    idea_id=IDEA.idea_id,
    parameter_name='Purkinje_H3K27ac_decompaction_interaction',
    parameter_value=observed_statistic,
    observed_statistic=observed_statistic,
    effect_size=effect_size,
    p_value=p_value,
    test_method=test_method,
    null_hypothesis='Purkinje and non-Purkinje trace/chromosome H3K27ac high-low adjacent-distance deltas are exchangeable; median difference is <= 0.',
    alternative_hypothesis='Purkinje trace/chromosome H3K27ac high-low adjacent-distance deltas are larger than non-Purkinje deltas; median difference is > 0.',
    hypothesis_test_status=hypothesis_test_status,
    # Sizes of the result table and of the two comparison groups.
    n_rows=int(len(result_table)),
    n_selected_cells=int(trace_delta_table['cell_id'].nunique()),
    n_purkinje_trace_chrom=int(len(pur)),
    n_nonpurkinje_trace_chrom=int(len(non)),
    n_permutations=int(n_perm),
    # Paths are reported relative to the project root.
    result_path=str(result_rel),
    statistical_figure_path=str(fig_rel),
    notes=summary_notes,
)
print(json.dumps(analysis_summary, indent=2))
display(result_table.head())

<IPython.core.display.Image object>
{
  "idea_id": "purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f",
  "parameter_name": "Purkinje_H3K27ac_decompaction_interaction",
  "parameter_value": -0.07634956292718045,
  "observed_statistic": -0.07634956292718045,
  "effect_size": -0.07634956292718045,
  "p_value": 1.0,
  "test_method": "one-sided label permutation test on trace/chromosome deltas, 500 permutations, statistic=median(Purkinje)-median(non-Purkinje)",
  "null_hypothesis": "Purkinje and non-Purkinje trace/chromosome H3K27ac high-low adjacent-distance deltas are exchangeable; median difference is <= 0.",
  "alternative_hypothesis": "Purkinje trace/chromosome H3K27ac high-low adjacent-distance deltas are larger than non-Purkinje deltas; median difference is > 0.",
  "hypothesis_test_status": "pass",
  "n_rows": 213,
  "n_selected_cells": 9,
  "n_purkinje_trace_chrom": 68,
  "n_nonpurkinje_trace_chrom": 145,
  "n_permutations": 500,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f_statistical_summary.png",
  "notes": [
    "Spot-level local adjacent distances were computed within each trace/chromosome after genomic sorting.",
    "H3K27ac high and low sets use within-trace/chromosome quartiles to reduce global intensity confounding.",
    "Permutation test shuffles Purkinje labels across trace/chromosome deltas as a compact negative control."
  ]
}
          trace_id  ... hypothesis_test_status
0  1_0_61_chr14_a2  ...                   pass
1   1_0_61_chr2_a2  ...                   pass
2  1_0_61_chr14_a1  ...                   pass
3   1_0_61_chr2_a1  ...                   pass
4   1_0_61_chr6_a2  ...                   pass

[5 rows x 18 columns]

# Every validation check requested by the idea starts out as 'not_run'.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
# The hypothesis-test check is always tracked, even if the idea did not list it.
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'
notes = []

def _check_keys(prefix):
    """Return the keys of the module-level ``checks`` dict that belong to ``prefix``.

    A key belongs to ``prefix`` when it equals it, or when it extends it with a
    ``:`` or ``_`` separated qualifier. The ``_`` form covers idea-supplied variants
    such as ``statistical_hypothesis_test_with_p_value``, which would otherwise never
    be updated and stay at ``'not_run'``. Keys are returned in dict order.
    """
    qualified = (prefix + ':', prefix + '_')
    return [key for key in checks if key == prefix or key.startswith(qualified)]

def _set_check(prefix, value):
    """Record ``value`` for every check matching ``prefix``.

    When no existing key matches, a new entry named ``prefix`` is created instead.
    """
    targets = _check_keys(prefix) or [prefix]
    for name in targets:
        checks[name] = value

def _check_status(prefix):
    """Collapse the statuses of all checks matching ``prefix`` into one value.

    Returns None when nothing matches, 'fail' if any matching check failed, 'pass'
    if every matching check passed, and otherwise the status of the first match.
    """
    statuses = [checks[name] for name in _check_keys(prefix)]
    if not statuses:
        return None
    if any(state == 'fail' for state in statuses):
        return 'fail'
    if statuses.count('pass') == len(statuses):
        return 'pass'
    return statuses[0]

# Required fields pass only when the schema review ran and accepted the idea.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # With nothing to compare against (no ChromData or no linked AnnData) the check passes.
    aligned = True
    if cdata is not None and adata is not None:
        # Compare the full id sequences; tables of different length compare unequal,
        # so a size mismatch is reported as a failure instead of silently passing.
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Prefer the count reported by the analysis, then fall back to table/container sizes.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        # NOTE(review): this fallback counts result rows, not distinct cells - confirm intent.
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
    # The headline parameter must be present and finite.
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
def _float_or_nan(raw):
    """Convert ``raw`` to float, returning NaN when the conversion raises."""
    try:
        return float(raw)
    except Exception:
        return np.nan


if _check_keys('statistical_hypothesis_test'):
    # Re-read every reported test field from the summary rather than trusting cell state.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    p_float = _float_or_nan(p_value)
    stat_float = _float_or_nan(observed_statistic)
    effect_float = _float_or_nan(effect_size)

    # The method and both hypotheses must be present and non-blank.
    texts_present = all(
        text is not None and str(text).strip() != ''
        for text in (test_method, null_hypothesis, alternative_hypothesis)
    )
    # p-value must be a finite probability; statistic and effect size must be finite.
    numbers_valid = bool(
        np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
    )
    # The result table must carry the test annotations as columns.
    table_annotated = (
        result_table is not None
        and hasattr(result_table, 'columns')
        and 'p_value' in result_table.columns
        and 'test_method' in result_table.columns
    )
    has_required_test = (
        texts_present
        and numbers_valid
        and hypothesis_test_status != 'insufficient_data'
        and table_annotated
    )
    if has_required_test:
        _set_check('statistical_hypothesis_test', 'pass')
    else:
        _set_check('statistical_hypothesis_test', 'fail')
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Search the test method text, the summary keys and the result columns for any
    # wording that indicates a permutation or control was part of the analysis.
    searchable = [
        str(analysis_summary.get('test_method', '')).lower(),
        ' '.join(str(key).lower() for key in analysis_summary),
    ]
    if result_table is not None and hasattr(result_table, 'columns'):
        searchable.append(' '.join(str(col).lower() for col in result_table.columns))
    else:
        searchable.append('')
    control_text = ' '.join(searchable)
    control_tokens = ('permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control')
    has_control_or_permutation = any(token in control_text for token in control_tokens)
    if has_control_or_permutation:
        _set_check('negative_control_or_permutation', 'pass')
    else:
        _set_check('negative_control_or_permutation', 'not_implemented')
# Any control-style check that was never evaluated is reported as not implemented.
for name, state in list(checks.items()):
    if state == 'not_run' and ('negative_control' in name or name.endswith('_control')):
        checks[name] = 'not_implemented'

# Overall status fails when any of these checks failed or when no rows were produced.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [name for name in required_for_pass if _check_status(name) == 'fail']
notes.extend(f'{name} failed' for name in failed_required)
status = 'fail' if failed_required else 'pass'

n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final record: verdict, per-check states and the headline numbers from the summary.
verification = dict(
    idea_id=IDEA.idea_id,
    status=status,
    checks=checks,
    parameter_value=analysis_summary.get('parameter_value'),
    p_value=analysis_summary.get('p_value'),
    test_method=analysis_summary.get('test_method'),
    effect_size=analysis_summary.get('effect_size'),
    result_path=analysis_summary.get('result_path'),
    notes=notes + analysis_summary.get('notes', []),
)
print(json.dumps(verification, indent=2))

{
  "idea_id": "purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_with_p_value": "not_run",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": -0.07634956292718045,
  "p_value": 1.0,
  "test_method": "one-sided label permutation test on trace/chromosome deltas, 500 permutations, statistic=median(Purkinje)-median(non-Purkinje)",
  "effect_size": -0.07634956292718045,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-specific-h3k27ac-decompaction-along-tra-8f00bdb00f_result.csv",
  "notes": [
    "Spot-level local adjacent distances were computed within each trace/chromosome after genomic sorting.",
    "H3K27ac high and low sets use within-trace/chromosome quartiles to reduce global intensity confounding.",
    "Permutation test shuffles Purkinje labels across trace/chromosome deltas as a compact negative control."
  ]
}

Auto-discovery idea: Purkinje-specific H3K27ac decompaction along traced chromosomes

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and compact analysis plan

Statistical figure

Runner verification summary

Final interpretation