from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

IDEA = DiscoveryIdea.from_dict({'idea_title': 'Reln expression predicts spatial coupling of H3K4me1 and CBP chromatin spots', 'biological_hypothesis': 'Higher Reln RNA expression is associated with closer spatial proximity between H3K4me1-enriched chromatin and CBP-enriched chromatin, reflecting enhancer-coactivator coupling in 3D nuclear space.', 'computable_parameter': 'Spearman rho between per-cell Reln expression and per-cell mean nearest-neighbor 3D distance from top-quartile tracks.H3K4me1 spots to top-quartile tracks.CBP spots within the same trace.', 'analysis_plan': 'Extract linked_adata.X values for linked_adata.var.Reln and align them to cells. For each cell and trace, use coords with spots.cell_id and spots.trace_id to select top-quartile H3K4me1 spots and top-quartile CBP spots, compute nearest-neighbor Euclidean distances from H3K4me1-high to CBP-high spots, and average to one value per cell. Test Spearman correlation between Reln expression and the cell-level distance parameter, with deterministic quantile thresholds and fixed-seed expression-label permutations.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata', 'rna_expression'], 'idea_markdown': '### Rationale\nReln is available in linked RNA expression and can be tested against enhancer-associated chromatin features measured by IF tracks.\n\n### Data used\nUse 3D coordinates, cell and trace assignments, H3K4me1 and CBP spot-level tracks, cell type labels, and linked Reln expression.\n\n### Analysis sketch\nWithin each cell and trace, measure the nearest-neighbor 3D distance from H3K4me1-high spots to CBP-high spots, then average to one cell-level enhancer-coupling distance. 
Correlate that distance with Reln expression.\n\n### Expected result\nIf Reln-high cells have tighter enhancer coactivator organization, they should show shorter H3K4me1-to-CBP distances.\n\n### Validation checks\nConfirm required fields, enough cells and high-track spots, finite distance output, Spearman p-value, runtime, deterministic rerun, and a permutation control shuffling Reln labels.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['coords', 'spots.cell_id', 'spots.trace_id', 'tracks.H3K4me1', 'tracks.CBP', 'cells.cell_type', 'linked_adata.X', 'linked_adata.var.Reln'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3', 'minimum_spot_or_trace_count_per_cell_for_H3K4me1_CBP_distance', 'finite_numeric_output', 'statistical_hypothesis_test_spearman_with_p_value', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation_by_shuffling_Reln_expression_across_cells'], 'expected_direction': 'Negative correlation: higher Reln expression should correspond to shorter H3K4me1-to-CBP nearest-neighbor distances.', 'complexity': 4, 'idea_id': 'reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e', 'metadata': {}})
# Locate the project root: the first of the working directory and its parents
# that contains the dataset file, else the hard-coded checkout location.
PROJECT_ROOT = next(
    filter(
        lambda root: (root / 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd').exists(),
        (Path.cwd(), *Path.cwd().parents),
    ),
    Path('/Users/weizexu/Projects/U-Chrom'),
)
H5CD_PATH = PROJECT_ROOT / 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = PROJECT_ROOT / 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg'
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): a Path instance is always truthy, so this guard never selects
# the None branch; a missing file surfaces as an error from ChromData.read.
cdata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)

# Schema and linked expression data are only available when a dataset was loaded.
if cdata is None:
    schema = None
    adata = None
else:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Review the idea against the dataset's discovery schema; skipped (review stays
# None) when no schema was loaded. A rejected review stops the notebook here.
review = None
if schema is not None:
    review = review_idea_against_schema(IDEA, schema)
print(review.to_dict() if review is not None else None)
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight inspection of alignment, fields, and finite coverage before the main test
import numpy as np
import pandas as pd

# Where the notebook runs and whether the dataset file is reachable.
print('working_directory:', Path.cwd())
print('H5CD exists:', H5CD_PATH.exists(), H5CD_PATH)

# Shapes of every component the main analysis reads.
print('cells shape:', cdata.cells.shape)
print('spots shape:', cdata.spots.shape)
print('coords shape:', np.asarray(cdata.coords).shape)
if hasattr(cdata.tracks, 'shape'):
    print('tracks shape:', cdata.tracks.shape)
else:
    print('tracks shape:', type(cdata.tracks))
print('linked_adata shape:', adata.shape if adata is not None else None)

# Cell IDs must appear in the same order in ChromData and the linked AnnData
# for positional expression lookups to be meaningful.
cell_id_alignment = [str(cell) for cell in cdata.cells.index] == [str(cell) for cell in adata.obs_names]
print('cell_id_alignment:', cell_id_alignment)
print('cell types:')
print(cdata.cells['cell_type'].value_counts().to_string())

# Summary statistics of the two IF tracks used by the idea.
track_preview = pd.DataFrame({
    marker: np.asarray(cdata.tracks[marker], dtype=float)
    for marker in ('H3K4me1', 'CBP')
})
print('\nTrack finite coverage:')
print(track_preview.agg(['count', 'min', 'median', 'max']).to_string())

# Reln column of the expression matrix, densified when the matrix offers todense().
reln_idx = [str(gene) for gene in adata.var_names].index('Reln')
reln_column = adata.X[:, reln_idx]
if hasattr(adata.X, 'todense'):
    reln_column = reln_column.todense()
reln_expr = np.asarray(reln_column).ravel()
expr_preview = pd.DataFrame({
    'cell_id': cdata.cells.index.astype(str),
    'cell_type': cdata.cells['cell_type'].astype(str).values,
    'Reln': reln_expr,
})
print('\nReln expression by cell:')
print(expr_preview.to_string(index=False))

working_directory: /Users/weizexu/Projects/U-Chrom
H5CD exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd
cells shape: (9, 10)
spots shape: (56036, 6)
coords shape: (56036, 3)
tracks shape: (56036, 62)
linked_adata shape: (9, 60)
cell_id_alignment: True
cell types:
cell_type
Granule     3
Bergmann    3
Purkinje    3

Track finite coverage:
            H3K4me1          CBP
count   56036.00000  56036.00000
min        -1.73910     -1.46210
median      0.37417      0.24196
max        10.61560      5.48390

Reln expression by cell:
cell_id cell_type  Reln
 1_0_42   Granule   3.0
 1_0_47   Granule   7.0
 1_0_69   Granule   7.0
 1_0_34  Bergmann   1.0
 1_0_61  Bergmann   0.0
 1_0_63  Bergmann   3.0
 1_0_26  Purkinje   2.0
 1_0_37  Purkinje   1.0
1_0_116  Purkinje   6.0

# Main exploratory analysis: Reln expression vs H3K4me1-to-CBP nearest-neighbor distance
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.spatial import cKDTree
from scipy import stats
import json

# Fixed seed so the permutation null is identical on every rerun.
rng = np.random.default_rng(20250220)
result_path = RUN_OUTPUT_DIR / 'reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e_result.csv'
figure_path = RUN_OUTPUT_DIR / 'reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e_statistical_summary.png'

# One row per spot: the spot annotations plus its 3D position and the two track values.
spots = cdata.spots.copy()
coords = np.asarray(cdata.coords, dtype=float)
h3 = np.asarray(cdata.tracks['H3K4me1'], dtype=float)
cbp = np.asarray(cdata.tracks['CBP'], dtype=float)
spots = spots.assign(
    spot_index=np.arange(len(spots)),
    x=coords[:, 0], y=coords[:, 1], z=coords[:, 2],
    H3K4me1=h3, CBP=cbp,
)

# Reln expression per cell, joined to ChromData cells by cell ID rather than by
# row position, so a reordered or larger linked AnnData cannot silently attach
# expression values to the wrong cells.
reln_idx = list(map(str, adata.var_names)).index('Reln')
reln_column = adata.X[:, reln_idx]
if hasattr(adata.X, 'todense'):
    reln_column = reln_column.todense()
reln_by_cell = pd.Series(
    np.asarray(reln_column, dtype=float).ravel(),
    index=pd.Index(adata.obs_names).astype(str),
)
if not reln_by_cell.index.is_unique:
    raise ValueError('linked_adata.obs_names contains duplicate cell IDs; cannot align Reln expression to cells')
cell_ids = pd.Index(cdata.cells.index.astype(str))
missing_cells = cell_ids.difference(reln_by_cell.index)
if len(missing_cells) > 0:
    raise ValueError(
        f'Reln expression is missing for {len(missing_cells)} ChromData cell(s), e.g. {list(missing_cells[:5])}'
    )
reln_expr = reln_by_cell.reindex(cell_ids).to_numpy(dtype=float)
cell_meta = pd.DataFrame({
    'cell_id': cdata.cells.index.astype(str),
    'cell_type': cdata.cells['cell_type'].astype(str).values,
    'Reln_expression': reln_expr.astype(float),
})

# Top-quartile selection is performed within each trace to avoid global intensity/tracing confounding.
# Output schema, fixed up front so trace_table keeps its columns even when no trace qualifies.
trace_columns = [
    'trace_id', 'cell_id', 'trace_n_spots', 'n_h3k4me1_high', 'n_cbp_high',
    'mean_nn_distance_um', 'median_nn_distance_um', 'q75_h3k4me1', 'q75_cbp',
]
xyz_columns = ['x', 'y', 'z']
trace_rows = []
# observed=True iterates only trace IDs that actually occur in the table; for a
# categorical trace_id this avoids empty groups and the pandas FutureWarning
# about the changing default.
for trace_id, g in spots.groupby('trace_id', sort=False, observed=True):
    # Too few spots for a within-trace quartile to be informative.
    if len(g) < 4:
        continue
    h3_values = g['H3K4me1'].to_numpy(float)
    cbp_values = g['CBP'].to_numpy(float)
    q_h3 = np.nanquantile(h3_values, 0.75)
    q_cbp = np.nanquantile(cbp_values, 0.75)
    # A spot without a finite 3D position cannot contribute a meaningful distance.
    has_position = np.isfinite(g[xyz_columns].to_numpy(float)).all(axis=1)
    h3_hi = g[has_position & np.isfinite(h3_values) & (h3_values >= q_h3)]
    cbp_hi = g[has_position & np.isfinite(cbp_values) & (cbp_values >= q_cbp)]
    if len(h3_hi) == 0 or len(cbp_hi) == 0:
        continue
    # Distance from every H3K4me1-high spot to its nearest CBP-high spot in the same trace.
    tree = cKDTree(cbp_hi[xyz_columns].to_numpy(float))
    nn_dist, _ = tree.query(h3_hi[xyz_columns].to_numpy(float), k=1)
    finite = nn_dist[np.isfinite(nn_dist)]
    if finite.size == 0:
        continue
    # The trace is attributed to the cell of its first spot.
    cell_id = str(g['cell_id'].iloc[0])
    trace_rows.append({
        'trace_id': trace_id,
        'cell_id': cell_id,
        'trace_n_spots': int(len(g)),
        'n_h3k4me1_high': int(len(h3_hi)),
        'n_cbp_high': int(len(cbp_hi)),
        'mean_nn_distance_um': float(np.mean(finite)),
        'median_nn_distance_um': float(np.median(finite)),
        'q75_h3k4me1': float(q_h3),
        'q75_cbp': float(q_cbp),
    })
trace_table = pd.DataFrame(trace_rows, columns=trace_columns)

# Collapse the per-trace measurements to one row per cell. Each entry maps an
# output column to (source column, aggregation).
per_cell_aggregations = {
    'mean_h3_to_cbp_nn_um': ('mean_nn_distance_um', 'mean'),
    'median_trace_nn_um': ('median_nn_distance_um', 'median'),
    'n_traces': ('trace_id', 'nunique'),
    'n_h3k4me1_high_spots': ('n_h3k4me1_high', 'sum'),
    'n_cbp_high_spots': ('n_cbp_high', 'sum'),
}
cell_distance = trace_table.groupby('cell_id', as_index=False).agg(**per_cell_aggregations)

# Left join keeps every cell with expression, including cells without a distance value.
result_table = pd.merge(cell_meta, cell_distance, how='left', on='cell_id')
result_table['finite_parameter'] = np.isfinite(result_table['mean_h3_to_cbp_nn_um'])

# Only cells with a finite distance parameter enter the hypothesis test.
analysis_rows = result_table.loc[result_table['finite_parameter']].copy()

null_hypothesis = 'Reln expression labels are exchangeable across cells with respect to per-cell mean H3K4me1-high to CBP-high nearest-neighbor distance; Spearman rho is 0 or not directionally negative.'
alternative_hypothesis = 'Higher Reln expression is associated with shorter H3K4me1-high to CBP-high nearest-neighbor distance across cells (negative Spearman association).'
test_method = 'Spearman correlation with 1000 fixed-seed Reln-label permutations (one-sided negative)'

n_perm = 1000
# The test needs at least four cells and variation in both variables.
can_test = (
    len(analysis_rows) >= 4
    and analysis_rows['Reln_expression'].nunique() >= 2
    and analysis_rows['mean_h3_to_cbp_nn_um'].nunique() >= 2
)
if not can_test:
    # Placeholder values; the status marks the result as not interpretable.
    rho = np.nan
    spearman_p_two_sided = np.nan
    null_rhos = np.array([], dtype=float)
    p_value = 1.0
    observed_statistic = effect_size = parameter_value = 0.0
    status = 'insufficient_data'
else:
    expr_values = analysis_rows['Reln_expression'].to_numpy(float)
    dist_values = analysis_rows['mean_h3_to_cbp_nn_um'].to_numpy(float)
    rho, spearman_p_two_sided = stats.spearmanr(expr_values, dist_values)
    rho = float(rho)
    # Null distribution: shuffle the expression labels, keep the distances fixed.
    permuted_rhos = np.array(
        [stats.spearmanr(rng.permutation(expr_values), dist_values).statistic for _ in range(n_perm)],
        dtype=float,
    )
    null_rhos = permuted_rhos[np.isfinite(permuted_rhos)]
    # One-sided p-value for the expected negative direction, with +1 correction.
    n_as_extreme = int(np.count_nonzero(null_rhos <= rho))
    p_value = float((n_as_extreme + 1) / (len(null_rhos) + 1))
    observed_statistic = effect_size = parameter_value = rho
    status = 'pass'

# Add the hypothesis-test columns required by verification to every result row.
test_columns = {
    'observed_statistic_spearman_rho': float(observed_statistic),
    'effect_size': float(effect_size),
    'p_value': float(p_value),
    'test_method': test_method,
    'hypothesis_test_status': status,
}
for column_name, column_value in test_columns.items():
    result_table[column_name] = column_value
result_table.to_csv(result_path, index=False)

# Statistical figure: cell-level scatter plus permutation null distribution.
fig, axes = plt.subplots(1, 2, figsize=(11, 4.5), facecolor='white')

# Left panel: one point per cell, coloured by cell type.
ax = axes[0]
cell_types = list(result_table['cell_type'].dropna().unique())
colors = dict(zip(cell_types, plt.cm.Set2(np.linspace(0, 1, max(len(cell_types), 1)))))
for ct, g in result_table.groupby('cell_type'):
    ax.scatter(g['Reln_expression'], g['mean_h3_to_cbp_nn_um'], s=70, label=f'{ct} (n={len(g)})', color=colors.get(ct), edgecolor='black', linewidth=0.5)
# A straight-line guide needs at least two cells with distinct expression
# values; with a constant x the least-squares fit is degenerate.
if len(analysis_rows) >= 2 and analysis_rows['Reln_expression'].nunique() >= 2:
    # Draw a visual trend line for orientation only (test remains Spearman/permutation).
    m, b = np.polyfit(analysis_rows['Reln_expression'], analysis_rows['mean_h3_to_cbp_nn_um'], 1)
    xs = np.linspace(analysis_rows['Reln_expression'].min(), analysis_rows['Reln_expression'].max(), 100)
    ax.plot(xs, m * xs + b, color='black', lw=1.5, label='linear guide')
ax.set_xlabel('Reln expression (linked RNA count)')
ax.set_ylabel('Mean H3K4me1-high to nearest CBP-high distance (µm)')
ax.set_title('Cell-level enhancer/coactivator distance')
# Only build a legend when something labelled was drawn.
if ax.get_legend_handles_labels()[0]:
    ax.legend(frameon=False, fontsize=8)
ax.grid(alpha=0.2)

# Right panel: permutation null with the observed statistic marked.
ax = axes[1]
if len(null_rhos):
    ax.hist(null_rhos, bins=25, color='0.78', edgecolor='white', label='Reln-label permutation null')
    ax.axvline(observed_statistic, color='crimson', lw=2, label=f'observed rho={observed_statistic:.3f}')
    ax.axvline(0, color='black', lw=1, linestyle='--', label='rho=0')
else:
    ax.text(0.5, 0.5, 'Insufficient data for permutation null', ha='center', va='center', transform=ax.transAxes)
ax.set_xlabel('Spearman rho under null')
ax.set_ylabel('Permutation count')
ax.set_title('Hypothesis-test evidence')
# In the insufficient-data case nothing labelled is drawn, so no legend is requested.
if ax.get_legend_handles_labels()[0]:
    ax.legend(frameon=False, fontsize=8, loc='upper right')
annotation = f"method: Spearman + {len(null_rhos)} permutations\np(one-sided negative)={p_value:.4f}\neffect size rho={effect_size:.3f}\nn cells={len(analysis_rows)}"
ax.text(0.03, 0.05, annotation, transform=ax.transAxes, ha='left', va='bottom', fontsize=9, bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.7', alpha=0.95))
ax.grid(alpha=0.2)

fig.suptitle('Reln expression vs spatial coupling of H3K4me1 and CBP chromatin spots', y=1.02, fontsize=12)
fig.tight_layout()
fig.savefig(figure_path, dpi=180, bbox_inches='tight')
display(fig)
plt.close(fig)

# Machine-readable summary consumed by the verification cell. JSON has no NaN,
# so a non-finite two-sided p-value is reported as None.
two_sided_p = float(spearman_p_two_sided) if np.isfinite(spearman_p_two_sided) else None
summary_notes = [
    'Top-quartile H3K4me1 and CBP spots selected within each trace.',
    'Distances are Euclidean 3D nearest-neighbor distances in micrometers, averaged per cell.',
    'Small n=9 cell subset; result is exploratory and permutation-based inference is used as a bounded negative control.'
]
analysis_summary = dict(
    idea_id=IDEA.idea_id,
    n_selected_cells=int(len(analysis_rows)),
    n_rows=int(len(result_table)),
    n_trace_measurements=int(len(trace_table)),
    parameter_value=float(parameter_value),
    observed_statistic=float(observed_statistic),
    effect_size=float(effect_size),
    p_value=float(p_value),
    spearman_p_two_sided_scipy=two_sided_p,
    test_method=test_method,
    null_hypothesis=null_hypothesis,
    alternative_hypothesis=alternative_hypothesis,
    hypothesis_test_status=status,
    expected_direction='negative',
    permutation_count=int(len(null_rhos)),
    # Paths are stored relative to the project root.
    result_path=str(result_path.relative_to(PROJECT_ROOT)),
    figure_path=str(figure_path.relative_to(PROJECT_ROOT)),
    notes=summary_notes,
)

print(json.dumps(analysis_summary, indent=2))
print('\nResult table:')
display(result_table)

Figure(1100x450)
{
  "idea_id": "reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e",
  "n_selected_cells": 9,
  "n_rows": 9,
  "n_trace_measurements": 213,
  "parameter_value": -0.10127393670836665,
  "observed_statistic": -0.10127393670836665,
  "effect_size": -0.10127393670836665,
  "p_value": 0.4155844155844156,
  "spearman_p_two_sided_scipy": 0.7954416654795309,
  "test_method": "Spearman correlation with 1000 fixed-seed Reln-label permutations (one-sided negative)",
  "null_hypothesis": "Reln expression labels are exchangeable across cells with respect to per-cell mean H3K4me1-high to CBP-high nearest-neighbor distance; Spearman rho is 0 or not directionally negative.",
  "alternative_hypothesis": "Higher Reln expression is associated with shorter H3K4me1-high to CBP-high nearest-neighbor distance across cells (negative Spearman association).",
  "hypothesis_test_status": "pass",
  "expected_direction": "negative",
  "permutation_count": 1000,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e_result.csv",
  "figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e_statistical_summary.png",
  "notes": [
    "Top-quartile H3K4me1 and CBP spots selected within each trace.",
    "Distances are Euclidean 3D nearest-neighbor distances in micrometers, averaged per cell.",
    "Small n=9 cell subset; result is exploratory and permutation-based inference is used as a bounded negative control."
  ]
}

Result table:
   cell_id  ... hypothesis_test_status
0   1_0_42  ...                   pass
1   1_0_47  ...                   pass
2   1_0_69  ...                   pass
3   1_0_34  ...                   pass
4   1_0_61  ...                   pass
5   1_0_63  ...                   pass
6   1_0_26  ...                   pass
7   1_0_37  ...                   pass
8  1_0_116  ...                   pass

[9 rows x 14 columns]

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e.ipynb:37: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  },

# Every validation check declared by the idea starts as 'not_run'; the generic
# statistical_hypothesis_test entry is always tracked as well.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
notes = []
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    """Return the keys of ``checks`` that belong to the check family *prefix*.

    A key belongs to the family when it equals *prefix* or extends it with a
    ``:`` or ``_`` separated qualifier. The ``_`` form is needed because
    idea-specific checks are named like
    ``minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3``; matching only
    ``prefix:detail`` leaves those entries permanently ``not_run`` and lets
    required checks be skipped silently.
    """
    qualified = (prefix + ':', prefix + '_')
    return [key for key in checks if key == prefix or key.startswith(qualified)]

def _set_check(prefix, value):
    """Record *value* for every key in the *prefix* family.

    When no existing key belongs to the family, a new entry named exactly
    *prefix* is created instead.
    """
    keys = _check_keys(prefix)
    if not keys:
        checks[prefix] = value
        return
    for key in keys:
        checks[key] = value

def _check_status(prefix):
    """Summarise the *prefix* family as a single status.

    Returns ``None`` when the family has no keys, ``'fail'`` when any member
    failed, ``'pass'`` when all members passed, and otherwise the status of the
    first member.
    """
    values = [checks[key] for key in _check_keys(prefix)]
    if not values:
        return None
    if 'fail' in values:
        return 'fail'
    if all(value == 'pass' for value in values):
        return 'pass'
    return values[0]

# The schema review must exist and have accepted the idea.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # Alignment passes only when both tables are loaded and list the same cell
    # IDs in the same order. Comparing the ID lists directly also covers a
    # length mismatch, which must not be reported as aligned.
    aligned = (
        cdata is not None
        and adata is not None
        and list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    )
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Preferred source is the analysis summary; fall back to the result table,
    # then to the loaded dataset, and finally to zero.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None:
        if 'cell_type' in getattr(result_table, 'columns', []):
            n_cells = len(result_table)
        elif cdata is not None and getattr(cdata, 'n_cells', 0):
            n_cells = len(cdata.cells)
        else:
            n_cells = 0
    has_cells = n_cells >= 1
    _set_check('minimum_cell_count', 'pass' if has_cells else 'fail')

if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = 0 if result_table is None else len(result_table)
    has_rows = n_rows >= 1
    _set_check('minimum_spot_or_trace_count', 'pass' if has_rows else 'fail')

if _check_keys('finite_numeric_output'):
    # The headline parameter must be present and finite.
    value = analysis_summary.get('parameter_value')
    is_finite_value = value is not None and np.isfinite(value)
    _set_check('finite_numeric_output', 'pass' if is_finite_value else 'fail')
def _float_or_nan(raw):
    """Convert *raw* to float, yielding NaN when the conversion raises."""
    try:
        return float(raw)
    except Exception:
        return np.nan


def _has_text(raw):
    """Return True when *raw* is not None and its string form is non-blank."""
    return raw is not None and str(raw).strip() != ''


if _check_keys('statistical_hypothesis_test'):
    # Pull every field the check inspects out of the analysis summary.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')

    p_float = _float_or_nan(p_value)
    stat_float = _float_or_nan(observed_statistic)
    effect_float = _float_or_nan(effect_size)

    # The summary must name the test and both hypotheses, carry a finite
    # p-value in [0, 1], a finite statistic and effect size, and must not be
    # flagged as having had insufficient data.
    summary_is_complete = (
        _has_text(test_method)
        and _has_text(null_hypothesis)
        and _has_text(alternative_hypothesis)
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    # The result table must also expose the p-value and the test method.
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = (
            summary_is_complete
            and 'p_value' in result_table.columns
            and 'test_method' in result_table.columns
        )
    else:
        has_required_test = False

    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Evidence of a control is searched for in the test description, the
    # summary's key names, and the result table's column names.
    control_tokens = ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary)
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    else:
        result_columns_text = ''
    control_text = f'{test_method_text} {summary_keys_text} {result_columns_text}'
    has_control_or_permutation = any(token in control_text for token in control_tokens)
    if has_control_or_permutation:
        _set_check('negative_control_or_permutation', 'pass')
    else:
        _set_check('negative_control_or_permutation', 'not_implemented')

# Control-style checks that were never evaluated are reported as not implemented
# rather than merely not run.
for check, state in list(checks.items()):
    looks_like_control = 'negative_control' in check or check.endswith('_control')
    if state == 'not_run' and looks_like_control:
        checks[check] = 'not_implemented'

# Overall status: any failed required check, or an empty result, fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [name for name in required_for_pass if _check_status(name) == 'fail']
notes.extend(f'{name} failed' for name in failed_required)
status = 'fail' if failed_required else 'pass'

n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Verification record: run status, per-check outcomes, and the headline
# numbers copied from the analysis summary.
verification = {'idea_id': IDEA.idea_id, 'status': status, 'checks': checks}
for field in ('parameter_value', 'p_value', 'test_method', 'effect_size', 'result_path'):
    verification[field] = analysis_summary.get(field)
verification['notes'] = notes + analysis_summary.get('notes', [])
print(json.dumps(verification, indent=2))

{
  "idea_id": "reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3": "not_run",
    "minimum_spot_or_trace_count_per_cell_for_H3K4me1_CBP_distance": "not_run",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_spearman_with_p_value": "not_run",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation_by_shuffling_Reln_expression_across_cells": "not_implemented",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": -0.10127393670836665,
  "p_value": 0.4155844155844156,
  "test_method": "Spearman correlation with 1000 fixed-seed Reln-label permutations (one-sided negative)",
  "effect_size": -0.10127393670836665,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/reln-expression-predicts-spatial-coupling-of-h3k-ec2890df4e_result.csv",
  "notes": [
    "Top-quartile H3K4me1 and CBP spots selected within each trace.",
    "Distances are Euclidean 3D nearest-neighbor distances in micrometers, averaged per cell.",
    "Small n=9 cell subset; result is exploratory and permutation-based inference is used as a bounded negative control."
  ]
}

Auto-discovery idea: Reln expression predicts spatial coupling of H3K4me1 and CBP chromatin spots

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and compact analysis plan

Statistical figure

Runner verification summary

Final interpretation

Final interpretation