# Ensure relative scaffold paths resolve from the project workspace.
# NOTE(review): the directory below is an absolute, machine-specific path;
# the relative 'tmp/...' paths used further down are resolved against it
# once the working directory has been switched.
import os
os.chdir('/Users/weizexu/Projects/U-Chrom')
print('cwd:', os.getcwd())  # echo the effective working directory into the run log

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

IDEA = DiscoveryIdea.from_dict({'idea_title': 'Purkinje marker expression predicts chromosome-wide radial positioning', 'biological_hypothesis': 'Cells with higher Purkinje marker Pcp2 expression have systematically shifted genome-wide radial chromatin positioning.', 'computable_parameter': 'Spearman rho across cells between linked_adata.X expression of linked_adata.var.Pcp2 and per-cell mean tracks.n_rad_score.', 'analysis_plan': 'Align cells.index, spots.cell_id, and linked_adata.obs_names. For each cell_id, average finite tracks.n_rad_score over all spots. Extract Pcp2 expression from linked_adata.X using linked_adata.var.Pcp2. Compute Spearman rho across cells and evaluate significance with an exact or permutation test by shuffling Pcp2 expression across cell IDs.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata', 'rna_expression'], 'idea_markdown': '### Rationale\nCell identity programs may be coupled to global nuclear architecture, with Purkinje-marker expression linked to radial repositioning of chromatin.\n\n### Data used\nUse linked RNA expression for `Pcp2`, spot-level radial score, cell IDs, and cell type annotations.\n\n### Analysis sketch\nFor each cell, compute the mean chromatin `n_rad_score` across all finite spots. 
Correlate this cell-level radial summary with linked `Pcp2` expression across the 9 matched cells.\n\n### Expected result\nA significant association would suggest that Purkinje transcriptional identity is accompanied by systematic nuclear-position differences.\n\n### Validation checks\nVerify RNA/chromatin cell alignment, required fields, enough cells, finite correlation, exact p-value or permutation p-value, runtime, deterministic rerun, and a negative control using permuted cell labels.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['spots.cell_id', 'tracks.n_rad_score', 'cells.cell_type', 'linked_adata.X', 'linked_adata.var.Pcp2'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count_at_least_9_total', 'minimum_spot_or_trace_count_at_least_1000_finite_spots', 'finite_numeric_output', 'statistical_hypothesis_test_spearman_exact_or_permutation_p_value', 'runtime_under_budget_5_minutes', 'deterministic_rerun_fixed_seed', 'negative_control_or_permutation_shuffle_Pcp2_across_cells'], 'expected_direction': 'A nonzero rho; direction indicates whether Pcp2-high Purkinje-like cells have more interior or more peripheral average chromatin radial scores.', 'complexity': 3, 'idea_id': 'purkinje-marker-expression-predicts-chromosome-w-ed79327c32', 'metadata': {}})
# Input dataset and per-run output directory, both relative to the cwd.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# With no dataset path configured, every handle stays None and the
# downstream cells are expected to cope with that.
cdata = None
schema = None
adata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
if cdata is not None:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

print(IDEA.idea_id)
if cdata is not None:
    # Dataset repr followed by the agent-facing description.
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

purkinje-marker-expression-predicts-chromosome-w-ed79327c32
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Review the idea against the dataset schema; without a schema there is
# nothing to review and the gate below is skipped.
review = review_idea_against_schema(IDEA, schema) if schema is not None else None
print(None if review is None else review.to_dict())
# Explicit raise instead of an `assert` statement: asserts are stripped when
# Python runs with -O, which would let a rejected idea through silently.
# The exception type and payload are the same as the assert produced.
if review is not None and not review.accepted:
    raise AssertionError(review.to_dict())

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection: alignment and finite radial-score coverage.
import numpy as np
import pandas as pd

# Echo the identifiers that must line up across the three components.
print('cells index:', list(cdata.cells.index))
print('linked_adata obs_names:', list(adata.obs_names))
print('Pcp2 in linked_adata.var_names:', 'Pcp2' in list(adata.var_names))
print('spot columns:', list(cdata.spots.columns)[:10])
print('available track keys include n_rad_score:', 'n_rad_score' in cdata.tracks)


def _coverage_finite_count(values):
    """Count the finite entries of one cell's radial scores."""
    return int(np.isfinite(values).sum())


def _coverage_nan_mean(values):
    """Mean of one cell's radial scores, ignoring NaN."""
    return float(np.nanmean(np.asarray(values, dtype=float)))


# Per-cell coverage of the radial score (groups come out sorted by cell_id).
rad = np.asarray(cdata.tracks['n_rad_score'], dtype=float)
spot_cell = pd.Series(cdata.spots['cell_id']).astype(str)
coverage = (
    pd.DataFrame({'cell_id': spot_cell, 'n_rad_score': rad})
    .groupby('cell_id')
    .agg(
        finite_spots=('n_rad_score', _coverage_finite_count),
        total_spots=('n_rad_score', 'size'),
        mean_n_rad_score=('n_rad_score', _coverage_nan_mean),
    )
)
coverage['finite_fraction'] = coverage['finite_spots'] / coverage['total_spots']
print('finite n_rad_score spots:', int(np.isfinite(rad).sum()), 'of', rad.size)
display(coverage.join(cdata.cells[['cell_type']], how='left'))

cells index: ['1_0_42', '1_0_47', '1_0_69', '1_0_34', '1_0_61', '1_0_63', '1_0_26', '1_0_37', '1_0_116']
linked_adata obs_names: ['1_0_42', '1_0_47', '1_0_69', '1_0_34', '1_0_61', '1_0_63', '1_0_26', '1_0_37', '1_0_116']
Pcp2 in linked_adata.var_names: True
spot columns: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
available track keys include n_rad_score: True
finite n_rad_score spots: 56036 of 56036
         finite_spots  total_spots  mean_n_rad_score  finite_fraction cell_type
cell_id                                                                        
1_0_116         11659        11659          0.716611              1.0  Purkinje
1_0_26           4225         4225          0.698348              1.0  Purkinje
1_0_34           3932         3932          0.751676              1.0  Bergmann
1_0_37           5238         5238          0.722048              1.0  Purkinje
1_0_42           4183         4183          0.752533              1.0   Granule
1_0_47           4682         4682          0.745255              1.0   Granule
1_0_61          11283        11283          0.737418              1.0  Bergmann
1_0_63           7614         7614          0.725035              1.0  Bergmann
1_0_69           3220         3220          0.751632              1.0   Granule

# Main exploration: test whether Pcp2 expression predicts per-cell mean radial chromatin score.
import os
os.environ.setdefault("MPLBACKEND", "Agg")
import matplotlib
matplotlib.use("Agg", force=True)
import matplotlib.pyplot as plt
from pathlib import Path
import itertools
import json
import numpy as np
import pandas as pd
from IPython.display import display, Image

# Seeded generator: the permutation null drawn later is reproducible.
rng = np.random.default_rng(20250609)
result_path = RUN_OUTPUT_DIR.joinpath('purkinje-marker-expression-predicts-chromosome-w-ed79327c32_result.csv')
figure_path = RUN_OUTPUT_DIR.joinpath('purkinje-marker-expression-predicts-chromosome-w-ed79327c32_statistical_summary.png')


def _radial_nan_mean(values):
    """NaN-ignoring mean of one cell's radial scores."""
    return float(np.nanmean(np.asarray(values, dtype=float)))


def _radial_nan_median(values):
    """NaN-ignoring median of one cell's radial scores."""
    return float(np.nanmedian(np.asarray(values, dtype=float)))


def _radial_finite_count(values):
    """Number of finite radial scores for one cell."""
    return int(np.isfinite(np.asarray(values, dtype=float)).sum())


# Extract per-cell chromatin radial summary.
radial = np.asarray(cdata.tracks['n_rad_score'], dtype=float)
spot_cells = pd.Series(cdata.spots['cell_id']).astype(str).to_numpy()
radial_df = pd.DataFrame({'cell_id': spot_cells, 'n_rad_score': radial})
# sort=False keeps cells in order of first appearance among the spots.
radial_by_cell = radial_df.groupby('cell_id', sort=False)
cell_radial = radial_by_cell.agg(
    mean_n_rad_score=('n_rad_score', _radial_nan_mean),
    median_n_rad_score=('n_rad_score', _radial_nan_median),
    finite_spots=('n_rad_score', _radial_finite_count),
    total_spots=('n_rad_score', 'size'),
)
cell_radial = cell_radial.reset_index()
cell_radial['finite_fraction'] = cell_radial['finite_spots'] / cell_radial['total_spots']

# Extract linked Pcp2 expression and align by cell_id.
var_names = list(adata.var_names)  # materialise once; used for lookup and indexing
if 'Pcp2' not in var_names:
    raise KeyError('Pcp2 not found in linked_adata.var_names')
pcp2_idx = var_names.index('Pcp2')
X_col = adata.X[:, pcp2_idx]
# Sparse matrices expose toarray(); dense inputs are used directly.
if hasattr(X_col, 'toarray'):
    pcp2_expr = np.asarray(X_col.toarray()).ravel().astype(float)
else:
    pcp2_expr = np.asarray(X_col).ravel().astype(float)
expr_df = pd.DataFrame({'cell_id': list(map(str, adata.obs_names)), 'Pcp2_expression': pcp2_expr})

# Name the index axis explicitly before resetting it. Renaming a column
# called 'index' only works when the cells index is unnamed, and leaves no
# 'cell_id' column at all when the index carries some other name.
cell_meta = cdata.cells[['cell_type']].copy()
cell_meta.index = cell_meta.index.astype(str)
cell_meta = cell_meta.rename_axis('cell_id').reset_index()

# Left joins keep one row per cell in cells-table order; cells missing from
# either source get NaN and are dropped by valid_mask below.
result_table = (cell_meta.merge(cell_radial, on='cell_id', how='left')
                .merge(expr_df, on='cell_id', how='left'))
result_table['aligned_to_linked_adata_order'] = result_table['cell_id'].tolist() == list(map(str, adata.obs_names))
valid_mask = np.isfinite(result_table['Pcp2_expression'].to_numpy()) & np.isfinite(result_table['mean_n_rad_score'].to_numpy())
valid = result_table.loc[valid_mask].copy()
n_cells = int(len(valid))
n_finite_spots = int(valid['finite_spots'].sum()) if n_cells else 0

null_hypothesis = 'Across matched cells, Pcp2 expression is exchangeable with respect to per-cell mean n_rad_score; Spearman rho equals 0.'
alternative_hypothesis = 'Pcp2 expression is monotonically associated with per-cell mean n_rad_score (two-sided nonzero Spearman rho).'
test_method = 'Spearman rank correlation with 1000 reproducible cell-label permutations'

def spearman_rho(x, y):
    """Return the Spearman rank correlation between ``x`` and ``y``.

    Both inputs are converted to float arrays, ranked with average ranks for
    ties, and the Pearson correlation of the ranks is returned as a float.
    Returns NaN when fewer than two values are given or when either input
    has zero NaN-ignoring standard deviation.
    """
    first, second = (np.asarray(values, dtype=float) for values in (x, y))
    too_short = len(first) < 2
    if too_short or np.nanstd(first) == 0 or np.nanstd(second) == 0:
        return np.nan
    first_ranks, second_ranks = (
        pd.Series(values).rank(method='average').to_numpy(dtype=float)
        for values in (first, second)
    )
    correlation = np.corrcoef(first_ranks, second_ranks)
    return float(correlation[0, 1])

# Two-sided permutation test of the Spearman association. It needs at least
# three cells and non-constant values on both axes; otherwise it falls back
# to a neutral "insufficient_data" record.
if n_cells >= 3 and np.nanstd(valid['Pcp2_expression']) > 0 and np.nanstd(valid['mean_n_rad_score']) > 0:
    x = valid['Pcp2_expression'].to_numpy(dtype=float)
    y = valid['mean_n_rad_score'].to_numpy(dtype=float)
    observed_rho = spearman_rho(x, y)
    n_permutations = 1000
    null_rhos = np.empty(n_permutations, dtype=float)
    for i in range(n_permutations):
        # Shuffle expression across cells while the radial summaries stay fixed.
        null_rhos[i] = spearman_rho(rng.permutation(x), y)
    # A permutation that ties the observed rho exactly in theory can land a
    # few ulps below it, because the correlation sums are accumulated in a
    # different order. Compare with a small tolerance so such ties count as
    # "at least as extreme". Attainable rho values are far more widely
    # spaced than the tolerance, so no other permutation is affected.
    rho_tolerance = 1e-12
    n_extreme = int(np.sum(np.abs(null_rhos) >= abs(observed_rho) - rho_tolerance))
    # The +1 in numerator and denominator counts the observed arrangement.
    p_value = float((n_extreme + 1) / (n_permutations + 1))
    effect_size = float(observed_rho)
    hypothesis_test_status = 'pass'
    test_note = 'Permutation null generated by shuffling Pcp2 expression among the 9 fixed chromatin cells.'
else:
    x = valid['Pcp2_expression'].to_numpy(dtype=float) if n_cells else np.array([])
    y = valid['mean_n_rad_score'].to_numpy(dtype=float) if n_cells else np.array([])
    observed_rho = spearman_rho(x, y) if n_cells >= 2 else 0.0
    # Keep the summary numeric even when rho is undefined.
    if not np.isfinite(observed_rho):
        observed_rho = 0.0
    null_rhos = np.array([0.0])
    p_value = 1.0
    effect_size = float(observed_rho)
    hypothesis_test_status = 'insufficient_data'
    n_permutations = 0
    test_note = 'Insufficient nonconstant finite cell-level values for permutation testing.'

# Broadcast the test-level scalars onto every per-cell row, then persist.
summary_columns = {
    'observed_statistic': float(observed_rho),
    'effect_size': float(effect_size),
    'p_value': float(p_value),
    'test_method': test_method,
    'hypothesis_test_status': hypothesis_test_status,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
}
for column_name, column_value in summary_columns.items():
    result_table[column_name] = column_value
result_table.to_csv(result_path, index=False)

# Machine-readable summary read by the verification cell further down.
analysis_summary = dict(
    idea_id=IDEA.idea_id,
    result_path=str(result_path),
    statistical_figure_path=str(figure_path),
    n_selected_cells=n_cells,
    n_rows=n_finite_spots,
    n_permutations=int(n_permutations),
    parameter_value=float(observed_rho),
    observed_statistic=float(observed_rho),
    effect_size=float(effect_size),
    p_value=float(p_value),
    test_method=test_method,
    null_hypothesis=null_hypothesis,
    alternative_hypothesis=alternative_hypothesis,
    hypothesis_test_status=hypothesis_test_status,
    finite_spots_total=n_finite_spots,
    cell_id_alignment_exact=bool(
        [str(cell) for cell in cdata.cells.index] == [str(obs) for obs in adata.obs_names]
    ),
    notes=[
        test_note,
        'Effect size is Spearman rho; radial score is the per-cell mean across all finite spots.',
    ],
)

# Statistical figure: cell-level association plus permutation null distribution.
# Left panel: one point per valid cell, coloured by cell type, with an
# optional least-squares guide line. Right panel: histogram of the permuted
# rho values with the observed rho marked.
plt.close('all')  # drop any figures left open by earlier cells
fig, axes = plt.subplots(1, 2, figsize=(10.5, 4.2), facecolor='white')
colors = {'Granule': '#4C78A8', 'Bergmann': '#F58518', 'Purkinje': '#54A24B'}
for ct, sub in valid.groupby('cell_type'):
    # Cell types missing from `colors` fall back to grey.
    axes[0].scatter(sub['Pcp2_expression'], sub['mean_n_rad_score'], s=70,
                    label=f'{ct} (n={len(sub)})', color=colors.get(ct, '#777777'),
                    edgecolor='black', linewidth=0.5, alpha=0.95)
# The guide line is only drawn when x varies; polyfit needs a non-constant x.
if n_cells >= 2 and np.nanstd(x) > 0:
    coef = np.polyfit(x, y, deg=1)
    xx = np.linspace(float(np.min(x)), float(np.max(x)), 100)
    axes[0].plot(xx, coef[0] * xx + coef[1], color='black', lw=1.3, label='linear guide')
axes[0].set_xlabel('Linked RNA Pcp2 expression (a.u.)')
axes[0].set_ylabel('Mean spot n_rad_score per cell (unitless)')
axes[0].set_title('Cell-level Pcp2 vs radial chromatin score')
axes[0].legend(frameon=False, fontsize=8)
axes[0].grid(True, alpha=0.25)

# 30 equal-width bins spanning the full range of a correlation coefficient.
bins = np.linspace(-1, 1, 31)
axes[1].hist(null_rhos, bins=bins, color='#B0B0B0', edgecolor='white', label='shuffled-label null')
axes[1].axvline(observed_rho, color='#D62728', lw=2.5, label=f'observed rho={observed_rho:.3f}')
# Dashed lines at +/-|rho| mark the two-sided rejection boundaries.
axes[1].axvline(-abs(observed_rho), color='#D62728', lw=1.2, ls='--', alpha=0.7)
axes[1].axvline(abs(observed_rho), color='#D62728', lw=1.2, ls='--', alpha=0.7)
axes[1].set_xlabel('Spearman rho after Pcp2 label shuffle')
axes[1].set_ylabel('Permutation count')
axes[1].set_title('Permutation test evidence')
axes[1].legend(frameon=False, fontsize=8)
axes[1].grid(True, alpha=0.25)
annotation = f"two-sided p={p_value:.4f}\neffect size rho={effect_size:.3f}\nn={n_cells} cells; {n_finite_spots:,} spots\n{test_method}"
axes[1].text(0.03, 0.97, annotation, transform=axes[1].transAxes, va='top', ha='left',
             fontsize=8.5, bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='#CCCCCC', alpha=0.95))
fig.suptitle('Purkinje marker expression and chromosome-wide radial positioning', y=1.02, fontsize=12)
fig.tight_layout()
fig.savefig(figure_path, dpi=180, bbox_inches='tight')
# The figure is shown twice: once as the live object, once re-read from the
# saved PNG, which confirms the file on disk is readable.
display(fig)
plt.close(fig)
display(Image(filename=str(figure_path)))

print(json.dumps(analysis_summary, indent=2))
display(result_table)

Figure(1050x420)
<IPython.core.display.Image object>
{
  "idea_id": "purkinje-marker-expression-predicts-chromosome-w-ed79327c32",
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-marker-expression-predicts-chromosome-w-ed79327c32_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-marker-expression-predicts-chromosome-w-ed79327c32_statistical_summary.png",
  "n_selected_cells": 9,
  "n_rows": 56036,
  "n_permutations": 1000,
  "parameter_value": -0.8619322310183947,
  "observed_statistic": -0.8619322310183947,
  "effect_size": -0.8619322310183947,
  "p_value": 0.001998001998001998,
  "test_method": "Spearman rank correlation with 1000 reproducible cell-label permutations",
  "null_hypothesis": "Across matched cells, Pcp2 expression is exchangeable with respect to per-cell mean n_rad_score; Spearman rho equals 0.",
  "alternative_hypothesis": "Pcp2 expression is monotonically associated with per-cell mean n_rad_score (two-sided nonzero Spearman rho).",
  "hypothesis_test_status": "pass",
  "finite_spots_total": 56036,
  "cell_id_alignment_exact": true,
  "notes": [
    "Permutation null generated by shuffling Pcp2 expression among the 9 fixed chromatin cells.",
    "Effect size is Spearman rho; radial score is the per-cell mean across all finite spots."
  ]
}
   cell_id  ...                             alternative_hypothesis
0   1_0_42  ...  Pcp2 expression is monotonically associated wi...
1   1_0_47  ...  Pcp2 expression is monotonically associated wi...
2   1_0_69  ...  Pcp2 expression is monotonically associated wi...
3   1_0_34  ...  Pcp2 expression is monotonically associated wi...
4   1_0_61  ...  Pcp2 expression is monotonically associated wi...
5   1_0_63  ...  Pcp2 expression is monotonically associated wi...
6   1_0_26  ...  Pcp2 expression is monotonically associated wi...
7   1_0_37  ...  Pcp2 expression is monotonically associated wi...
8  1_0_116  ...  Pcp2 expression is monotonically associated wi...

[9 rows x 16 columns]

# Ensure required relative output paths exist from the workspace filesystem for runner/QA tools.
from pathlib import Path
import shutil, os
# Make sure the figure and result CSV exist at the workspace-relative
# location that the runner/QA tools look in.
workspace_dir = Path('/Users/weizexu/Projects/U-Chrom')
required_dir = workspace_dir / 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg'
required_dir.mkdir(parents=True, exist_ok=True)
required_figure = required_dir / 'purkinje-marker-expression-predicts-chromosome-w-ed79327c32_statistical_summary.png'
required_result = required_dir / 'purkinje-marker-expression-predicts-chromosome-w-ed79327c32_result.csv'

# The paths recorded in analysis_summary are the files this run wrote, so
# they take priority. Copy only when they resolve to a different file than
# the required location (for example when the cwd differed at write time).
recorded_figure = Path(analysis_summary['statistical_figure_path'])
recorded_result = Path(analysis_summary['result_path'])
for recorded, required in [(recorded_figure, required_figure), (recorded_result, required_result)]:
    if recorded.exists() and recorded.resolve() != required.resolve():
        shutil.copy2(recorded, required)

# Last resort for the figure: a copy that landed in the cwd or workspace
# root. This is used only when the required file is still missing, so a
# stale leftover can never overwrite a freshly written figure.
if not required_figure.exists():
    for candidate in [Path(required_figure.name), workspace_dir / required_figure.name]:
        if candidate.exists():
            shutil.copy2(candidate, required_figure)
            break

# Report workspace-relative paths downstream.
analysis_summary['statistical_figure_path'] = str(required_figure.relative_to(workspace_dir))
analysis_summary['result_path'] = str(required_result.relative_to(workspace_dir))
print('cwd', os.getcwd())
print('figure exists', required_figure.exists(), required_figure, required_figure.stat().st_size if required_figure.exists() else None)
print('result exists', required_result.exists(), required_result, required_result.stat().st_size if required_result.exists() else None)
print(analysis_summary['statistical_figure_path'])

cwd /Users/weizexu/Projects/U-Chrom
figure exists True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-marker-expression-predicts-chromosome-w-ed79327c32_statistical_summary.png 129669
result exists True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-marker-expression-predicts-chromosome-w-ed79327c32_result.csv 4283
tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-marker-expression-predicts-chromosome-w-ed79327c32_statistical_summary.png

# Every validation check requested by the idea starts out as 'not_run'; the
# generic hypothesis-test check is always tracked, requested or not.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
checks.setdefault('statistical_hypothesis_test', 'not_run')
notes = []

def _check_keys(prefix):
    """Return the keys of ``checks`` that belong to the check family ``prefix``.

    A key belongs to the family when it equals ``prefix`` or extends it with
    a ``:``- or ``_``-separated qualifier. The idea's own checks use the
    underscore form (e.g. ``minimum_cell_count_at_least_9_total``); matching
    only ``prefix`` and ``prefix:`` left those checks permanently 'not_run'.
    Keys are returned in the insertion order of ``checks``.
    """
    qualified = (prefix + ':', prefix + '_')
    return [key for key in checks if key == prefix or key.startswith(qualified)]

def _set_check(prefix, value):
    """Record ``value`` for every check key in the ``prefix`` family.

    When no existing key matches, a new entry named exactly ``prefix`` is
    created instead.
    """
    targets = _check_keys(prefix) or [prefix]
    for target in targets:
        checks[target] = value

def _check_status(prefix):
    """Collapse the statuses of the ``prefix`` check family into one value.

    Returns None when the family has no keys, 'fail' if any member failed,
    'pass' if every member passed, and otherwise the status of the first
    matching key.
    """
    statuses = [checks[name] for name in _check_keys(prefix)]
    if len(statuses) == 0:
        return None
    if statuses.count('fail') > 0:
        return 'fail'
    if statuses.count('pass') == len(statuses):
        return 'pass'
    return statuses[0]

# Required fields are satisfied exactly when the schema review accepted the idea.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # With either side missing there is nothing to align, so the check
    # passes vacuously.
    aligned = True
    if cdata is not None and adata is not None:
        # Compare the identifier lists directly: list equality is also False
        # when the two sides hold different numbers of cells, so a count
        # mismatch is reported as a failure rather than a silent pass.
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Prefer the count reported by the analysis; fall back to the result
    # table, then to the dataset's cell table, then to zero.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None:
        if 'cell_type' in getattr(result_table, 'columns', []):
            n_cells = len(result_table)
        elif cdata is not None and getattr(cdata, 'n_cells', 0):
            n_cells = len(cdata.cells)
        else:
            n_cells = 0
    if n_cells >= 1:
        _set_check('minimum_cell_count', 'pass')
    else:
        _set_check('minimum_cell_count', 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    # Row count reported by the analysis, else the length of the result table.
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        if result_table is not None:
            n_rows = len(result_table)
        else:
            n_rows = 0
    if n_rows >= 1:
        _set_check('minimum_spot_or_trace_count', 'pass')
    else:
        _set_check('minimum_spot_or_trace_count', 'fail')
if _check_keys('finite_numeric_output'):
    # The headline parameter must be present and finite.
    value = analysis_summary.get('parameter_value')
    if value is not None and np.isfinite(value):
        _set_check('finite_numeric_output', 'pass')
    else:
        _set_check('finite_numeric_output', 'fail')
if _check_keys('statistical_hypothesis_test'):
    def _as_float_or_nan(raw):
        """Best-effort float conversion; anything unconvertible becomes NaN."""
        try:
            return float(raw)
        except Exception:
            return np.nan

    # Pull every reported test field back out of the summary.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')

    p_float = _as_float_or_nan(p_value)
    stat_float = _as_float_or_nan(observed_statistic)
    effect_float = _as_float_or_nan(effect_size)

    # The three descriptive fields must be present and non-blank.
    text_fields_present = all(
        field is not None and str(field).strip() != ''
        for field in (test_method, null_hypothesis, alternative_hypothesis)
    )
    has_required_test = (
        text_fields_present
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    # The per-row table must carry the test columns as well.
    if result_table is None or not hasattr(result_table, 'columns'):
        has_required_test = False
    else:
        has_required_test = has_required_test and all(
            column in result_table.columns for column in ('p_value', 'test_method')
        )

    if has_required_test:
        _set_check('statistical_hypothesis_test', 'pass')
    else:
        _set_check('statistical_hypothesis_test', 'fail')
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Text search over the test description, the summary keys and the
    # result-table column names for any sign of a control or permutation.
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    else:
        result_columns_text = ''
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    control_tokens = ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    has_control_or_permutation = any(token in control_text for token in control_tokens)
    if has_control_or_permutation:
        _set_check('negative_control_or_permutation', 'pass')
    else:
        _set_check('negative_control_or_permutation', 'not_implemented')
# Any control-style check nobody evaluated is reported as not implemented
# rather than left as 'not_run'.
for check, state in list(checks.items()):
    if state != 'not_run':
        continue
    if 'negative_control' in check or check.endswith('_control'):
        checks[check] = 'not_implemented'

# Overall status: any failed mandatory check family fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
status = 'pass'
for check in required_for_pass:
    if _check_status(check) != 'fail':
        continue
    status = 'fail'
    notes.append(f'{check} failed')

# An analysis that produced no rows fails regardless of the checks.
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    if result_table is not None:
        n_rows_for_status = len(result_table)
    else:
        n_rows_for_status = 0
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final record: verification notes first, then the notes from the analysis.
verification = dict(
    idea_id=IDEA.idea_id,
    status=status,
    checks=checks,
    parameter_value=analysis_summary.get('parameter_value'),
    p_value=analysis_summary.get('p_value'),
    test_method=analysis_summary.get('test_method'),
    effect_size=analysis_summary.get('effect_size'),
    result_path=analysis_summary.get('result_path'),
    notes=notes + analysis_summary.get('notes', []),
)
print(json.dumps(verification, indent=2))

{
  "idea_id": "purkinje-marker-expression-predicts-chromosome-w-ed79327c32",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count_at_least_9_total": "not_run",
    "minimum_spot_or_trace_count_at_least_1000_finite_spots": "not_run",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_spearman_exact_or_permutation_p_value": "not_run",
    "runtime_under_budget_5_minutes": "not_run",
    "deterministic_rerun_fixed_seed": "not_run",
    "negative_control_or_permutation_shuffle_Pcp2_across_cells": "not_implemented",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": -0.8619322310183947,
  "p_value": 0.001998001998001998,
  "test_method": "Spearman rank correlation with 1000 reproducible cell-label permutations",
  "effect_size": -0.8619322310183947,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/purkinje-marker-expression-predicts-chromosome-w-ed79327c32_result.csv",
  "notes": [
    "Permutation null generated by shuffling Pcp2 expression among the 9 fixed chromatin cells.",
    "Effect size is Spearman rho; radial score is the per-cell mean across all finite spots."
  ]
}

Auto-discovery idea: Purkinje marker expression predicts chromosome-wide radial positioning¶

Rationale¶

Data used¶

Analysis sketch¶

Expected result¶

Validation checks¶

Graphical abstract¶

Required data checks¶

Exploration¶

Critique and analysis plan¶

Statistical figure¶

Runner verification summary¶

Final interpretation¶

Final interpretation¶