# Ensure relative paths in the scaffold resolve from the U-Chrom project root.
from pathlib import Path
import os
# The project root can be overridden through the UCHROM_ROOT environment
# variable; when it is unset the original location is used unchanged.
root = Path(os.environ.get('UCHROM_ROOT', '/Users/weizexu/Projects/U-Chrom'))
os.chdir(root)
print('cwd:', Path.cwd())

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Idea specification passed to DiscoveryIdea.from_dict.
# The markdown text is assembled from adjacent string literals (one per
# section); line breaks inside the text are written as \n escapes so that no
# literal spans more than one physical line.
IDEA = DiscoveryIdea.from_dict({
    'idea_title': 'Pcp2 expression predicts H3K27ac-marked chromatin spatial clustering',
    'biological_hypothesis': 'Higher Pcp2 RNA expression is associated with stronger 3D clustering of H3K27ac-enriched chromatin, reflecting coordinated active regulatory compartmentalization in Purkinje-like cells.',
    'computable_parameter': 'Spearman rho between per-cell Pcp2 expression and per-cell mean pairwise 3D distance among top-quartile tracks.H3K27ac spots within traces.',
    'analysis_plan': 'Align cells.index to linked_adata obs_names, extract linked_adata.X values for linked_adata.var.Pcp2, and use spots.cell_id plus spots.trace_id to group coords by cell and trace. Within each cell, select top-quartile tracks.H3K27ac spots per trace where enough spots exist, compute all pairwise Euclidean distances from coords, average within traces, then average to a cell-level value. Test Spearman association between Pcp2 expression and this single distance parameter across the 9 cells; use deterministic quantile thresholds and a fixed random seed for permutations.',
    'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata', 'rna_expression'],
    'idea_markdown': (
        '### Rationale\nPcp2 is present in the linked RNA matrix and can be used as a Purkinje-associated expression axis across the 9 linked cells.\n\n'
        '### Data used\nUse 3D chromatin coordinates, spot-to-cell and spot-to-trace mappings, H3K27ac IF track intensity, cell type labels, and linked Pcp2 expression.\n\n'
        '### Analysis sketch\nFor each cell, identify H3K27ac-high spots within each trace and compute their mean pairwise 3D distance; summarize to one cell-level H3K27ac-cluster distance. Correlate this distance with Pcp2 expression across cells.\n\n'
        '### Expected result\nCells with higher Pcp2 expression are expected to show shorter distances among H3K27ac-high chromatin spots, consistent with active chromatin clustering.\n\n'
        '### Validation checks\nConfirm fields and cell counts, require finite distances, test Spearman correlation with a p-value, rerun deterministically, and compare against permuted Pcp2 cell labels.'
    ),
    'cell_types': ['Granule', 'Bergmann', 'Purkinje'],
    'required_fields': [
        'coords',
        'spots.cell_id',
        'spots.trace_id',
        'tracks.H3K27ac',
        'cells.cell_type',
        'linked_adata.X',
        'linked_adata.var.Pcp2',
    ],
    'validation_checks': [
        'required_fields_exist',
        'minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3',
        'minimum_spot_or_trace_count_per_cell_for_H3K27ac_high_distance',
        'finite_numeric_output',
        'statistical_hypothesis_test_spearman_with_p_value',
        'runtime_under_budget',
        'deterministic_rerun',
        'negative_control_or_permutation_by_shuffling_Pcp2_expression_across_cells',
    ],
    'expected_direction': 'Negative correlation: higher Pcp2 expression should correspond to smaller H3K27ac-high chromatin distances.',
    'complexity': 4,
    'idea_id': 'pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250',
    'metadata': {},
})
# Dataset location and per-run output directory (both relative to the cwd).
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the dataset only when a path is configured; otherwise every handle is None.
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
else:
    cdata = None

if cdata is None:
    schema = None
    adata = None
else:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

# Echo the idea id and, when data is loaded, its summary and agent description.
print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Review the idea against the dataset schema; with no schema there is no review.
review = None
if schema is not None:
    review = review_idea_against_schema(IDEA, schema)

if review is None:
    print(None)
else:
    print(review.to_dict())

# Stop here when a review exists but was not accepted.
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight inspection: alignment, selected columns, and finite-value coverage.
import numpy as np
import pandas as pd

# Report cell/RNA id alignment, the available columns, and value coverage.
cells_index_ids = list(map(str, cdata.cells.index))
adata_obs_ids = list(map(str, adata.obs_names))
print('cells index:', cells_index_ids)
print('adata obs_names:', adata_obs_ids)
print('cell/adata aligned:', cells_index_ids == adata_obs_ids)
print('cell_type counts:', cdata.cells['cell_type'].value_counts().to_dict())
print('spots columns:', list(cdata.spots.columns))
print('coords shape:', cdata.coords.shape, 'tracks shape:', cdata.tracks.shape)
print('H3K27ac finite:', int(np.isfinite(cdata.tracks['H3K27ac'].to_numpy()).sum()), '/', len(cdata.tracks))
print('spot cell_id non-null:', int(cdata.spots['cell_id'].notna().sum()), '/', len(cdata.spots))
print('spot trace_id non-null:', int(cdata.spots['trace_id'].notna().sum()), '/', len(cdata.spots))
print('Pcp2 in genes:', 'Pcp2' in list(map(str, adata.var_names)))
# Preview a few aligned rows without printing the full dataset.
preview = cdata.cells[['cell_type']].copy()
# observed=False is passed explicitly: it keeps the current grouping behaviour
# and avoids the pandas FutureWarning about the changing default.
preview['n_spots_by_cell'] = (
    cdata.spots.groupby('cell_id', observed=False)
    .size()
    .reindex(cdata.cells.index)
    .fillna(0)
    .astype(int)
)
print(preview.head(9).to_string())

cells index: ['1_0_42', '1_0_47', '1_0_69', '1_0_34', '1_0_61', '1_0_63', '1_0_26', '1_0_37', '1_0_116']
adata obs_names: ['1_0_42', '1_0_47', '1_0_69', '1_0_34', '1_0_61', '1_0_63', '1_0_26', '1_0_37', '1_0_116']
cell/adata aligned: True
cell_type counts: {'Granule': 3, 'Bergmann': 3, 'Purkinje': 3}
spots columns: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
coords shape: (56036, 3) tracks shape: (56036, 62)
H3K27ac finite: 56036 / 56036
spot cell_id non-null: 56036 / 56036
spot trace_id non-null: 56036 / 56036
Pcp2 in genes: True
        cell_type  n_spots_by_cell
cell_id                           
1_0_42    Granule             4183
1_0_47    Granule             4682
1_0_69    Granule             3220
1_0_34   Bergmann             3932
1_0_61   Bergmann            11283
1_0_63   Bergmann             7614
1_0_26   Purkinje             4225
1_0_37   Purkinje             5238
1_0_116  Purkinje            11659

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250.ipynb:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  "For each cell, identify H3K27ac-high spots within each trace and compute their mean pairwise 3D distance; summarize to one cell-level H3K27ac-cluster distance. Correlate this distance with Pcp2 expression across cells.\n",

# Main compact analysis for the Pcp2/H3K27ac-high chromatin-clustering idea.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display, Image
from scipy import sparse, stats

# Fixed-seed generator shared by the subsampling guard and the permutation null.
rng = np.random.default_rng(seed=667250)
# Output artefacts for this idea live directly under the run directory.
result_path = RUN_OUTPUT_DIR.joinpath('pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250_result.csv')
figure_path = RUN_OUTPUT_DIR.joinpath('pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250_statistical_summary.png')

# Pull the Pcp2 column from the linked expression matrix, after confirming that
# the cell table and the expression matrix list the same ids in the same order.
cell_ids = [str(cell) for cell in cdata.cells.index]
adata_obs = [str(obs) for obs in adata.obs_names]
aligned = cell_ids == adata_obs
if not aligned:
    raise ValueError('cdata.cells.index and linked_adata.obs_names are not aligned; explicit reindexing would be required.')
if 'Pcp2' not in [str(gene) for gene in adata.var_names]:
    raise ValueError('Pcp2 is not present in linked_adata.var_names')
pcp2_x = adata[:, ['Pcp2']].X
# Densify sparse storage before flattening to a 1-D vector.
pcp2 = np.asarray(pcp2_x.toarray() if sparse.issparse(pcp2_x) else pcp2_x).ravel()

# Assemble one table per spot: string ids, 3D position, and H3K27ac intensity.
coords = np.asarray(cdata.coords, dtype=float)
h3 = cdata.tracks['H3K27ac'].to_numpy(dtype=float)
spot_df = cdata.spots[['cell_id', 'trace_id']].copy()
for id_column in ('cell_id', 'trace_id'):
    spot_df[id_column] = spot_df[id_column].astype(str)
for axis_index, axis_column in enumerate(('x_um', 'y_um', 'z_um')):
    spot_df[axis_column] = coords[:, axis_index]
spot_df['H3K27ac'] = h3

# Per (cell, trace) group: keep spots at or above the group's 75th-percentile
# H3K27ac value and record the mean of all pairwise 3D distances among them.
trace_rows = []
grouped_spots = spot_df.groupby(['cell_id', 'trace_id'], sort=False, observed=False)
for (cell_id, trace_id), trace_spots in grouped_spots:
    marker = trace_spots['H3K27ac'].to_numpy(dtype=float)
    positions = trace_spots[['x_um', 'y_um', 'z_um']].to_numpy(dtype=float)

    # Drop spots with a non-finite intensity or any non-finite coordinate.
    usable = np.isfinite(marker) & np.isfinite(positions).all(axis=1)
    marker, positions = marker[usable], positions[usable]
    n_usable = len(marker)
    if n_usable < 4:
        continue

    threshold = float(np.quantile(marker, 0.75))
    is_high = marker >= threshold
    if int(is_high.sum()) < 3:
        continue

    high_positions = positions[is_high]
    # Guard against an all-vs-all computation on very large groups by
    # subsampling to 5000 spots with the shared generator.
    if high_positions.shape[0] > 5000:
        keep = rng.choice(high_positions.shape[0], size=5000, replace=False)
        high_positions = high_positions[keep]

    # Full distance matrix by broadcasting; only the strict upper triangle
    # (each unordered pair once) enters the mean.
    deltas = high_positions[:, None, :] - high_positions[None, :, :]
    pair_distances = np.sqrt(np.sum(deltas * deltas, axis=2))
    upper = np.triu_indices(high_positions.shape[0], k=1)
    mean_dist = float(np.mean(pair_distances[upper]))

    trace_rows.append(dict(
        cell_id=cell_id,
        trace_id=trace_id,
        n_trace_spots=int(n_usable),
        n_h3k27ac_high_spots=int(high_positions.shape[0]),
        h3k27ac_q75=threshold,
        trace_mean_pairwise_distance_um=mean_dist,
    ))

trace_table = pd.DataFrame(trace_rows)
if trace_table.empty:
    raise RuntimeError('No eligible cell/trace groups had enough H3K27ac-high spots for distances.')

# Collapse trace-level distances to one row per cell, ordered like cell_ids.
distance_aggregations = {
    'mean_h3k27ac_high_pairwise_distance_um': ('trace_mean_pairwise_distance_um', 'mean'),
    'median_h3k27ac_high_pairwise_distance_um': ('trace_mean_pairwise_distance_um', 'median'),
    'n_eligible_traces': ('trace_id', 'nunique'),
    'mean_h3k27ac_high_spots_per_trace': ('n_h3k27ac_high_spots', 'mean'),
}
per_cell_groups = trace_table.groupby('cell_id', as_index=True)
cell_distance = per_cell_groups.agg(**distance_aggregations).reindex(cell_ids)

# Cell-level table: cell type, Pcp2 expression, then the distance summaries.
result_table = cdata.cells[['cell_type']].copy()
result_table.index = result_table.index.astype(str)
result_table['pcp2_expression'] = pcp2
result_table = result_table.join(cell_distance)
# Keep the id as a regular column as well as the index.
result_table['cell_id'] = result_table.index

# Restrict the test to cells where both quantities are finite.
pcp2_is_finite = np.isfinite(result_table['pcp2_expression'].to_numpy(dtype=float))
distance_is_finite = np.isfinite(result_table['mean_h3k27ac_high_pairwise_distance_um'].to_numpy(dtype=float))
valid_mask = pcp2_is_finite & distance_is_finite
valid = result_table.loc[valid_mask].copy()
n_valid = int(len(valid))
min_required = 4
null_hypothesis = 'Across linked cells, Pcp2 expression is not negatively associated with mean pairwise 3D distance among within-trace H3K27ac-top-quartile chromatin spots (Spearman rho >= 0).'
alternative_hypothesis = 'Higher Pcp2 expression is associated with shorter mean pairwise 3D distance among H3K27ac-top-quartile chromatin spots (Spearman rho < 0).'
test_method = 'Spearman correlation, one-sided negative; 1000 deterministic label-shuffle permutations as null/control distribution'

# The test needs enough cells and some variation in both variables.
testable = (
    n_valid >= min_required
    and valid['pcp2_expression'].nunique() >= 2
    and valid['mean_h3k27ac_high_pairwise_distance_um'].nunique() >= 2
)
if not testable:
    # Neutral placeholders so downstream reporting still has every field.
    observed_rho = 0.0
    scipy_p = 1.0
    n_perm = 0
    null_rhos = np.array([], dtype=float)
    permutation_p_value = 1.0
    p_value = 1.0
    hypothesis_test_status = 'insufficient_data'
else:
    x = valid['pcp2_expression'].to_numpy(dtype=float)
    y = valid['mean_h3k27ac_high_pairwise_distance_um'].to_numpy(dtype=float)
    spearman_res = stats.spearmanr(x, y, alternative='less')
    observed_rho = float(spearman_res.statistic)
    scipy_p = float(spearman_res.pvalue)
    # Null distribution: shuffle expression across cells, one draw per permutation.
    n_perm = 1000
    null_rhos = np.array(
        [float(stats.spearmanr(rng.permutation(x), y).statistic) for _ in range(n_perm)],
        dtype=float,
    )
    # One-sided (lower-tail) permutation p-value with the +1 correction.
    n_as_extreme = np.count_nonzero(null_rhos <= observed_rho)
    permutation_p_value = float((1 + n_as_extreme) / (n_perm + 1))
    p_value = scipy_p
    hypothesis_test_status = 'pass'

# The effect size is the Spearman rho itself; a negative value is the expected direction.
effect_size = float(observed_rho)
# Broadcast the test-level results onto every row, then persist without the index.
for column_name, column_value in (
    ('observed_statistic', observed_rho),
    ('effect_size_spearman_rho', effect_size),
    ('p_value', p_value),
    ('permutation_p_value', permutation_p_value),
    ('test_method', test_method),
    ('hypothesis_test_status', hypothesis_test_status),
):
    result_table[column_name] = column_value
result_table.to_csv(result_path, index=False)

# Machine-readable summary that the verification step reads back.
summary_notes = [
    f'Computed per-cell distance from {len(trace_table)} eligible cell-trace groups.',
    'Distances are in micrometers because schema xyz_unit is um.',
    'Permutation null shuffles Pcp2 labels across the selected cells with a fixed local RNG seed.',
]
analysis_summary = dict(
    idea_id=IDEA.idea_id,
    n_rows=int(len(result_table)),
    n_selected_cells=n_valid,
    n_eligible_traces_total=int(trace_table['trace_id'].nunique()),
    parameter_name='Spearman rho: Pcp2 expression vs mean H3K27ac-high pairwise distance',
    parameter_value=observed_rho,
    observed_statistic=observed_rho,
    effect_size=effect_size,
    p_value=p_value,
    permutation_p_value=permutation_p_value,
    test_method=test_method,
    null_hypothesis=null_hypothesis,
    alternative_hypothesis=alternative_hypothesis,
    hypothesis_test_status=hypothesis_test_status,
    expected_direction='negative',
    result_path=str(result_path),
    statistical_figure_path=str(figure_path),
    notes=summary_notes,
)

# Statistical figure: cell-level comparison (left) and permutation null (right).
fig, axes = plt.subplots(1, 2, figsize=(10.5, 4.2), dpi=160)
fig.patch.set_facecolor('white')
for ax in axes:
    ax.set_facecolor('white')

# Left panel: one scatter series per cell type.
colors = {'Granule': '#4C78A8', 'Bergmann': '#F58518', 'Purkinje': '#54A24B'}
for ct, sub in valid.groupby('cell_type', sort=True):
    axes[0].scatter(
        sub['pcp2_expression'],
        sub['mean_h3k27ac_high_pairwise_distance_um'],
        s=58,
        color=colors.get(ct, '0.35'),
        label=f'{ct} (n={len(sub)})',
        edgecolor='black',
        linewidth=0.5,
        alpha=0.9,
    )
if n_valid >= 2:
    # Visual trend line is ordinary least squares for display only; hypothesis test is Spearman.
    x = valid['pcp2_expression'].to_numpy(dtype=float)
    y = valid['mean_h3k27ac_high_pairwise_distance_um'].to_numpy(dtype=float)
    # A line fit needs at least two distinct x values.
    if np.unique(x).size > 1:
        m, b = np.polyfit(x, y, 1)
        xs = np.linspace(float(np.min(x)), float(np.max(x)), 50)
        axes[0].plot(xs, m * xs + b, color='black', lw=1.2, label='OLS guide')
axes[0].set_xlabel('Pcp2 expression (linked RNA matrix)')
axes[0].set_ylabel('Mean pairwise distance among H3K27ac-high spots (um)')
axes[0].set_title('Cell-level association')
# Only draw a legend when something labelled was plotted; this avoids an empty
# legend (and the accompanying matplotlib warning) when no cell is valid.
if axes[0].get_legend_handles_labels()[0]:
    axes[0].legend(frameon=False, fontsize=7)
axes[0].grid(alpha=0.2, linewidth=0.5)

# Right panel: permutation null with the observed statistic marked.
if len(null_rhos):
    axes[1].hist(null_rhos, bins=24, color='#9ecae1', edgecolor='white', label='Shuffled Pcp2 labels')
    axes[1].axvline(observed_rho, color='#d62728', lw=2.0, label=f'Observed rho={observed_rho:.3f}')
    axes[1].axvline(0, color='0.25', lw=1.0, ls='--', label='rho=0')
else:
    axes[1].text(0.5, 0.5, 'Insufficient data for permutation null', ha='center', va='center', transform=axes[1].transAxes)
axes[1].set_xlabel('Spearman rho under shuffled labels')
axes[1].set_ylabel('Permutation count')
axes[1].set_title('Hypothesis-test evidence')
# The insufficient-data branch adds no labelled artists, so guard the legend here too.
if axes[1].get_legend_handles_labels()[0]:
    axes[1].legend(frameon=False, fontsize=7)
axes[1].grid(alpha=0.2, linewidth=0.5)

annotation = f"Spearman one-sided p={p_value:.3g}\nperm p={permutation_p_value:.3g}; n={n_valid} cells\neffect size rho={effect_size:.3f}"
axes[1].text(0.02, 0.98, annotation, transform=axes[1].transAxes, va='top', ha='left', fontsize=8,
             bbox=dict(facecolor='white', edgecolor='0.7', boxstyle='round,pad=0.25'))
fig.suptitle('Pcp2 expression vs H3K27ac-marked chromatin spatial clustering', fontsize=11)
fig.tight_layout(rect=[0, 0, 1, 0.93])
fig.savefig(figure_path, bbox_inches='tight', facecolor='white')
# Show the saved file rather than the live figure, then release the figure.
display(Image(filename=str(figure_path)))
plt.close(fig)

print('result_table:')
display(result_table)
print('analysis_summary:')
print(json.dumps(analysis_summary, indent=2))

<IPython.core.display.Image object>
result_table:
        cell_type  ...  hypothesis_test_status
cell_id            ...                        
1_0_42    Granule  ...                    pass
1_0_47    Granule  ...                    pass
1_0_69    Granule  ...                    pass
1_0_34   Bergmann  ...                    pass
1_0_61   Bergmann  ...                    pass
1_0_63   Bergmann  ...                    pass
1_0_26   Purkinje  ...                    pass
1_0_37   Purkinje  ...                    pass
1_0_116  Purkinje  ...                    pass

[9 rows x 13 columns]
analysis_summary:
{
  "idea_id": "pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250",
  "n_rows": 9,
  "n_selected_cells": 9,
  "n_eligible_traces_total": 213,
  "parameter_name": "Spearman rho: Pcp2 expression vs mean H3K27ac-high pairwise distance",
  "parameter_value": 0.870300505105952,
  "observed_statistic": 0.870300505105952,
  "effect_size": 0.870300505105952,
  "p_value": 0.998863110802371,
  "permutation_p_value": 0.996003996003996,
  "test_method": "Spearman correlation, one-sided negative; 1000 deterministic label-shuffle permutations as null/control distribution",
  "null_hypothesis": "Across linked cells, Pcp2 expression is not negatively associated with mean pairwise 3D distance among within-trace H3K27ac-top-quartile chromatin spots (Spearman rho >= 0).",
  "alternative_hypothesis": "Higher Pcp2 expression is associated with shorter mean pairwise 3D distance among H3K27ac-top-quartile chromatin spots (Spearman rho < 0).",
  "hypothesis_test_status": "pass",
  "expected_direction": "negative",
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250_statistical_summary.png",
  "notes": [
    "Computed per-cell distance from 213 eligible cell-trace groups.",
    "Distances are in micrometers because schema xyz_unit is um.",
    "Permutation null shuffles Pcp2 labels across the selected cells with a fixed local RNG seed."
  ]
}

# Every check requested by the idea starts as 'not_run'; the generic
# statistical test entry is added when the idea does not list it verbatim.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
notes = []
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    """Return the keys of ``checks`` that belong to the check family *prefix*.

    A key belongs to the family when it equals *prefix* or extends it with a
    ``:`` or ``_`` separated qualifier, for example
    ``statistical_hypothesis_test_spearman_with_p_value`` for the prefix
    ``statistical_hypothesis_test``. Keys are returned in ``checks`` order.
    """
    # Requiring a separator after the prefix keeps unrelated names that merely
    # share leading characters (e.g. 'minimum_cell_counts') out of the family.
    qualified = (prefix + ':', prefix + '_')
    return [key for key in checks if key == prefix or key.startswith(qualified)]

def _set_check(prefix, value):
    """Store *value* for every check key matched by *prefix*.

    When no existing key matches, a new entry named *prefix* is created.
    """
    targets = _check_keys(prefix) or [prefix]
    for target in targets:
        checks[target] = value

def _check_status(prefix):
    """Summarise the status of the checks matched by *prefix*.

    Returns None when nothing matches, 'fail' when any matched check failed,
    'pass' when all of them passed, and otherwise the status of the first
    matched check.
    """
    statuses = [checks[name] for name in _check_keys(prefix)]
    if not statuses:
        return None
    if statuses.count('fail') > 0:
        return 'fail'
    if statuses.count('pass') == len(statuses):
        return 'pass'
    return statuses[0]

# Field availability follows directly from the schema review outcome.
fields_ok = review is not None and review.accepted
_set_check('required_fields_exist', 'pass' if fields_ok else 'fail')

# Cell-id alignment: compared only when both tables exist and have equal length;
# otherwise the check keeps its default of aligned.
if _check_keys('cell_id_alignment'):
    aligned = True
    comparable = (
        cdata is not None
        and adata is not None
        and len(cdata.cells) == len(adata.obs_names)
    )
    if comparable:
        aligned = [str(cell) for cell in cdata.cells.index] == [str(obs) for obs in adata.obs_names]
    if aligned:
        _set_check('cell_id_alignment', 'pass')
    else:
        _set_check('cell_id_alignment', 'fail')

# Cell count: prefer the summary value, then the result table, then the dataset.
if _check_keys('minimum_cell_count'):
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None:
        if 'cell_type' in getattr(result_table, 'columns', []):
            n_cells = len(result_table)
        elif cdata is not None and getattr(cdata, 'n_cells', 0):
            n_cells = len(cdata.cells)
        else:
            n_cells = 0
    if n_cells >= 1:
        _set_check('minimum_cell_count', 'pass')
    else:
        _set_check('minimum_cell_count', 'fail')

# Row count: prefer the summary value, then the result table length.
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = 0 if result_table is None else len(result_table)
    if n_rows >= 1:
        _set_check('minimum_spot_or_trace_count', 'pass')
    else:
        _set_check('minimum_spot_or_trace_count', 'fail')

# The reported parameter must be present and finite.
if _check_keys('finite_numeric_output'):
    value = analysis_summary.get('parameter_value')
    output_is_finite = value is not None and np.isfinite(value)
    _set_check('finite_numeric_output', 'pass' if output_is_finite else 'fail')
def _float_or_nan(raw):
    """Return ``float(raw)``, or NaN when *raw* cannot be converted."""
    try:
        return float(raw)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric text, or an out-of-range integer: treat as missing.
        return np.nan


# The statistical-test check passes only when the summary carries a complete,
# well-formed test record and the result table exposes the test columns.
if _check_keys('statistical_hypothesis_test'):
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    p_float = _float_or_nan(p_value)
    stat_float = _float_or_nan(observed_statistic)
    effect_float = _float_or_nan(effect_size)
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
# A control counts as present when any control-related word appears in the test
# method text, the summary keys, or the result-table column names.
if _check_keys('negative_control_or_permutation'):
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(name).lower() for name in analysis_summary.keys())
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(column).lower() for column in result_table.columns)
    else:
        result_columns_text = ''
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    control_tokens = ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    has_control_or_permutation = any(token in control_text for token in control_tokens)
    if has_control_or_permutation:
        _set_check('negative_control_or_permutation', 'pass')
    else:
        _set_check('negative_control_or_permutation', 'not_implemented')

# Control-style checks that were never evaluated are reported as not implemented.
for check_name in list(checks):
    is_control_check = 'negative_control' in check_name or check_name.endswith('_control')
    if is_control_check and checks[check_name] == 'not_run':
        checks[check_name] = 'not_implemented'
# The run fails when any mandatory check failed or no result rows were produced.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [name for name in required_for_pass if _check_status(name) == 'fail']
status = 'fail' if failed_required else 'pass'
notes.extend(f'{name} failed' for name in failed_required)

n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final report: verification notes first, then the notes from the analysis.
verification = dict(
    idea_id=IDEA.idea_id,
    status=status,
    checks=checks,
    parameter_value=analysis_summary.get('parameter_value'),
    p_value=analysis_summary.get('p_value'),
    test_method=analysis_summary.get('test_method'),
    effect_size=analysis_summary.get('effect_size'),
    result_path=analysis_summary.get('result_path'),
    notes=notes + analysis_summary.get('notes', []),
)
print(json.dumps(verification, indent=2))

{
  "idea_id": "pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3": "not_run",
    "minimum_spot_or_trace_count_per_cell_for_H3K27ac_high_distance": "not_run",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_spearman_with_p_value": "not_run",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation_by_shuffling_Pcp2_expression_across_cells": "not_implemented",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": 0.870300505105952,
  "p_value": 0.998863110802371,
  "test_method": "Spearman correlation, one-sided negative; 1000 deterministic label-shuffle permutations as null/control distribution",
  "effect_size": 0.870300505105952,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-expression-predicts-h3k27ac-marked-chromati-667c6ee250_result.csv",
  "notes": [
    "Computed per-cell distance from 213 eligible cell-trace groups.",
    "Distances are in micrometers because schema xyz_unit is um.",
    "Permutation null shuffles Pcp2 labels across the selected cells with a fixed local RNG seed."
  ]
}

Auto-discovery idea: Pcp2 expression predicts H3K27ac-marked chromatin spatial clustering¶

Rationale¶

Data used¶

Analysis sketch¶

Expected result¶

Validation checks¶

Graphical abstract¶

Required data checks¶

Exploration¶

Critique and compact analysis plan¶

Statistical figure¶

Runner verification summary¶

Final interpretation¶

Final interpretation¶