# Ensure relative data paths resolve from the repository root when the notebook kernel starts in notebooks/.
from pathlib import Path
import os
# NOTE(review): the repository root is hard-coded to one machine's absolute path; on any
# other checkout os.chdir fails and the relative data paths used later cannot resolve.
root = Path('/Users/weizexu/Projects/U-Chrom')
# Switch the process working directory so later relative paths resolve against `root`.
os.chdir(root)
print('cwd', Path.cwd())

cwd /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

IDEA = DiscoveryIdea.from_dict({'idea_title': 'RNAPIISer2-P neighborhoods around polyA_RNA spots should survive cell-label negative controls', 'biological_hypothesis': 'High polyA_RNA chromatin-associated spots are spatially closer to high RNAPIISer2-P spots within the same cell than expected under a cell-label permutation negative control.', 'computable_parameter': 'polyA_to_RNAPIISer2P_nn_um: median same-cell nearest-neighbor 3D distance from top-quartile polyA_RNA spots to top-quartile RNAPIISer2-P spots.', 'analysis_plan': 'Within each cell, identify top-quartile tracks.polyA_RNA spots and top-quartile tracks.RNAPIISer2-P spots. Compute the nearest 3D distance from each selected polyA_RNA spot to any selected RNAPIISer2-P spot in the same cell using coords, then aggregate the median across cells. Test whether the observed parameter is smaller than a null distribution generated by permuting RNAPIISer2-P spot cell labels while preserving global signal ranks.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nTranscriptionally engaged nuclear regions should place mature RNA signal near elongating polymerase signal. A robustness hypothesis is that this proximity should disappear when cell identities are permuted.\n\n### Data used\nUse 3D coordinates, cell IDs, polyA_RNA track values, and RNAPIISer2-P track values for all available cell types.\n\n### Analysis sketch\nSelect high polyA_RNA spots and high RNAPIISer2-P spots within each cell. 
The single parameter is the median nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots within the same cell.\n\n### Expected result\nThe observed nearest-neighbor distance should be smaller than distances obtained after permuting cell labels for RNAPIISer2-P spots.\n\n### Validation checks\nCheck required fields, enough cells and high-signal spots, finite distance output, a permutation p-value, deterministic seeded rerun, runtime, and a negative control using cell-label permutation.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['coords', 'spots.cell_id', 'tracks.polyA_RNA', 'tracks.RNAPIISer2-P', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test_with_p_value', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'polyA_to_RNAPIISer2P_nn_um should be lower than the cell-label-permuted null distribution.', 'complexity': 3, 'idea_id': 'rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af', 'metadata': {}})
# Input dataset and per-run output directory, both relative to the working directory.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the dataset only when a path is configured; every handle falls back to None.
cdata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
schema = None
adata = None
if cdata is not None:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

# Echo the idea id and, when data is loaded, its summary and agent-facing description.
print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Review the idea against the dataset schema; without a schema there is nothing to review.
if schema is None:
    review = None
else:
    review = review_idea_against_schema(IDEA, schema)
_review_payload = review.to_dict() if review is not None else None
print(_review_payload)
# Stop here when the review rejected the idea; the assertion message carries the details.
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection: verify required columns/tracks and spot-level coverage.
import numpy as np
import pandas as pd

# Fields the idea depends on: one spot-table column and two IF tracks.
required_spot_cols = ['cell_id']
required_tracks = ['polyA_RNA', 'RNAPIISer2-P']

# Table dimensions and cell-type composition.
print('spots shape:', cdata.spots.shape)
print('spots columns:', list(cdata.spots.columns)[:12])
print('cells shape:', cdata.cells.shape)
print('cell type counts:', cdata.cells['cell_type'].value_counts().to_dict())
print('tracks shape:', cdata.tracks.shape)

# Presence flag for each required column / track.
spot_col_present = {}
for col in required_spot_cols:
    spot_col_present[col] = col in cdata.spots.columns
print('has required spot cols:', spot_col_present)
track_present = {}
for trk in required_tracks:
    track_present[trk] = trk in cdata.tracks.columns
print('has required tracks:', track_present)
print('spots index preview:', cdata.spots.head(3).to_dict('records'))

# Coordinate array shape and the share of finite entries, when coords are exposed.
coords_available = hasattr(cdata, 'coords')
print('coords attr exists:', coords_available)
if coords_available:
    coords_arr = np.asarray(cdata.coords)
    finite_share = float(np.isfinite(coords_arr).mean())
    print('coords shape:', coords_arr.shape, 'finite fraction:', finite_share)

# One coverage row per required track: finite share, 75th percentile, positive share.
coverage_records = []
for track_name in required_tracks:
    track_values = cdata.tracks[track_name]
    coverage_records.append({
        'track': track_name,
        'finite_fraction': float(np.isfinite(track_values).mean()),
        'q75': float(np.nanquantile(track_values, 0.75)),
        'nonzero_fraction': float((track_values.fillna(0) > 0).mean()),
    })
coverage = pd.DataFrame(
    coverage_records,
    columns=['track', 'finite_fraction', 'q75', 'nonzero_fraction'],
)
display(coverage)

spots shape: (56036, 6)
spots columns: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
cells shape: (9, 10)
cell type counts: {'Granule': 3, 'Bergmann': 3, 'Purkinje': 3}
tracks shape: (56036, 62)
has required spot cols: {'cell_id': True}
has required tracks: {'polyA_RNA': True, 'RNAPIISer2-P': True}
spots index preview: [{'chrom': 'chr14', 'start': 30425000, 'end': 30450000, 'trace_id': '1_0_61_chr14_a2', 'cell_id': '1_0_61', 'name': 'chr14-1096'}, {'chrom': 'chr2', 'start': 99725000, 'end': 99750000, 'trace_id': '1_0_61_chr2_a2', 'cell_id': '1_0_61', 'name': 'chr2-3864'}, {'chrom': 'chr14', 'start': 39625000, 'end': 39650000, 'trace_id': '1_0_61_chr14_a1', 'cell_id': '1_0_61', 'name': 'chr14-1464'}]
coords attr exists: True
coords shape: (56036, 3) finite fraction: 1.0
          track  finite_fraction      q75  nonzero_fraction
0     polyA_RNA              1.0  0.38913          0.357431
1  RNAPIISer2-P              1.0  0.76854          0.510832

# Main compact exploration: top-quartile polyA_RNA to RNAPIISer2-P nearest-neighbor distance
# with a seeded cell-label permutation negative control.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display, Image
import numpy as np
import pandas as pd
import json
from pathlib import Path

# Single seeded generator; it is consumed by the per-cell subsampling in _sample_indices
# and by the permutation loop, so a rerun reproduces results only if the call order is unchanged.
rng = np.random.default_rng(8675309)
# Output artifacts for this idea (per-cell result table and summary figure).
result_path = RUN_OUTPUT_DIR / 'rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_result.csv'
fig_path = RUN_OUTPUT_DIR / 'rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png'
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Build one positional spot table: reset both indexes so spot rows, track rows and
# coordinate rows line up by position (assumes all three share row order — TODO confirm).
coords = np.asarray(cdata.coords, dtype=float)
spots = cdata.spots.reset_index(drop=True).copy()
tracks = cdata.tracks.reset_index(drop=True)
# Non-numeric track values become NaN rather than raising.
spots['polyA_RNA'] = pd.to_numeric(tracks['polyA_RNA'], errors='coerce')
spots['RNAPIISer2_P'] = pd.to_numeric(tracks['RNAPIISer2-P'], errors='coerce')
spots[['x_um', 'y_um', 'z_um']] = coords
# Map each cell id (the index of cdata.cells) to its cell type as a string.
cell_type_map = cdata.cells['cell_type'].astype(str).to_dict()
spots['cell_type'] = spots['cell_id'].map(cell_type_map)

# Top quartile is computed within each cell. To keep pairwise distance work bounded,
# sample at most this many high spots per marker per cell after thresholding.
max_high_spots_per_cell = 400
# Sorted string cell ids give a fixed iteration order for the seeded sampling below.
cell_ids = sorted(spots['cell_id'].dropna().astype(str).unique())
# Per-cell selected spot row positions for each marker, and one summary row per cell.
selected_polyA = {}
selected_rnap = {}
cell_rows = []

def _sample_indices(indices, max_n):
    """Return `indices` as a sorted int array, thinned to at most `max_n` entries.

    Thinning draws without replacement from the module-level `rng`, so the result
    depends on the generator's current state. Inputs already within the cap are
    only sorted and consume no random numbers.
    """
    candidates = np.asarray(indices, dtype=int)
    if len(candidates) <= max_n:
        return np.sort(candidates)
    kept = rng.choice(candidates, size=max_n, replace=False)
    return np.sort(kept)

def _nn_median(poly_coords, rnap_coords):
    """Median, over query points, of the Euclidean distance to the nearest target point.

    `poly_coords` holds the query points and `rnap_coords` the targets; both are
    indexed as (point, coordinate) arrays. Returns NaN when either set is empty.
    """
    n_queries = len(poly_coords)
    if n_queries == 0 or len(rnap_coords) == 0:
        return np.nan
    # Process queries in fixed-size batches so the (batch, targets, coordinate)
    # difference array stays small.
    batch = 200
    targets = rnap_coords[None, :, :]
    nearest = []
    for lo in range(0, n_queries, batch):
        queries = poly_coords[lo:lo + batch]
        squared = ((queries[:, None, :] - targets) ** 2).sum(axis=2)
        nearest.append(np.sqrt(squared.min(axis=1)))
    return float(np.median(np.concatenate(nearest)))

# Per-spot cell ids as strings, converted once here rather than once per cell in the loop.
spot_cell_ids = spots['cell_id'].astype(str).values

# For every cell: threshold both markers at the cell's own 75th percentile, cap the
# number of selected spots, and record the median polyA -> RNAPIISer2-P nearest distance.
for cid in cell_ids:
    mask = (spot_cell_ids == cid)
    idx = np.flatnonzero(mask)
    sub = spots.iloc[idx]
    poly_q75 = float(np.nanquantile(sub['polyA_RNA'], 0.75))
    rnap_q75 = float(np.nanquantile(sub['RNAPIISer2_P'], 0.75))
    # Map within-cell positions back to global row positions; NaN values never qualify.
    poly_idx_all = idx[np.flatnonzero(np.isfinite(sub['polyA_RNA'].values) & (sub['polyA_RNA'].values >= poly_q75))]
    rnap_idx_all = idx[np.flatnonzero(np.isfinite(sub['RNAPIISer2_P'].values) & (sub['RNAPIISer2_P'].values >= rnap_q75))]
    # polyA is sampled before RNAPII in every cell; this order fixes the rng stream.
    poly_idx = _sample_indices(poly_idx_all, max_high_spots_per_cell)
    rnap_idx = _sample_indices(rnap_idx_all, max_high_spots_per_cell)
    selected_polyA[cid] = poly_idx
    selected_rnap[cid] = rnap_idx
    med_nn = _nn_median(coords[poly_idx], coords[rnap_idx])
    cell_rows.append({
        'cell_id': cid,
        'cell_type': str(cell_type_map.get(cid, 'unknown')),
        'n_polyA_high_all': int(len(poly_idx_all)),
        'n_RNAPIISer2P_high_all': int(len(rnap_idx_all)),
        'n_polyA_high_sampled': int(len(poly_idx)),
        'n_RNAPIISer2P_high_sampled': int(len(rnap_idx)),
        'polyA_q75': poly_q75,
        'RNAPIISer2P_q75': rnap_q75,
        'median_polyA_to_RNAPIISer2P_nn_um': med_nn,
    })

# Observed statistic: median across cells of the finite per-cell medians (NaN if none).
per_cell = pd.DataFrame(cell_rows)
valid_cell_medians = per_cell['median_polyA_to_RNAPIISer2P_nn_um'].replace([np.inf, -np.inf], np.nan).dropna().values
observed_statistic = float(np.median(valid_cell_medians)) if len(valid_cell_medians) else np.nan

# Permutation null: pool the sampled high RNAPIISer2-P spots and shuffle their cell labels while
# preserving the number of RNAPII spots assigned to each cell. polyA cell labels remain fixed.
# NOTE(review): reassigned RNAPII spots keep their original coordinates; if coords live in one
# shared frame across cells, null distances largely reflect cell-to-cell separation — confirm
# this is the intended negative control.
nonempty_rnap = [selected_rnap[cid] for cid in cell_ids if len(selected_rnap[cid])]
# np.concatenate raises on an empty list, so build an empty pool explicitly and let the
# guard below skip the permutations instead of crashing.
rnap_pool = np.concatenate(nonempty_rnap) if nonempty_rnap else np.empty(0, dtype=int)
rnap_counts = {cid: len(selected_rnap[cid]) for cid in cell_ids}
n_permutations = 500
null_stats = []
# Run only with at least two finite per-cell medians, a non-empty pool, and polyA spots in every cell.
if len(valid_cell_medians) >= 2 and len(rnap_pool) > 0 and all(len(selected_polyA[cid]) > 0 for cid in cell_ids):
    for _ in range(n_permutations):
        shuffled_pool = rng.permutation(rnap_pool)
        # Deal the shuffled pool back out in cell order, each cell receiving its original count.
        cursor = 0
        perm_meds = []
        for cid in cell_ids:
            n_r = rnap_counts[cid]
            assigned_rnap = shuffled_pool[cursor:cursor+n_r]
            cursor += n_r
            if n_r > 0 and len(selected_polyA[cid]) > 0:
                perm_meds.append(_nn_median(coords[selected_polyA[cid]], coords[assigned_rnap]))
        perm_meds = np.asarray(perm_meds, dtype=float)
        perm_meds = perm_meds[np.isfinite(perm_meds)]
        # Same aggregation as the observed statistic: median across cells.
        if len(perm_meds):
            null_stats.append(float(np.median(perm_meds)))
null_stats = np.asarray(null_stats, dtype=float)

# A formal test needs at least 100 permutation draws and a finite observed statistic.
if len(null_stats) >= 100 and np.isfinite(observed_statistic):
    # One-sided test: observed proximity is smaller than permuted-cell null.
    # The +1 in numerator and denominator counts the observed value as one draw,
    # so the p-value can never be exactly zero.
    p_value = float((1 + np.sum(null_stats <= observed_statistic)) / (len(null_stats) + 1))
    null_median = float(np.median(null_stats))
    effect_size = float(null_median - observed_statistic)  # positive means observed is closer than null
    hypothesis_test_status = 'pass'
    test_note = 'Seeded one-sided cell-label permutation test completed.'
else:
    # Fallback values when no formal test is possible: p is pinned to 1.0 and the
    # effect size to 0.0 unless both medians happen to be finite.
    p_value = 1.0
    null_median = float(np.nanmedian(null_stats)) if len(null_stats) else np.nan
    effect_size = float(null_median - observed_statistic) if np.isfinite(null_median) and np.isfinite(observed_statistic) else 0.0
    hypothesis_test_status = 'insufficient_data'
    test_note = 'Too few finite cell medians or permutation draws for a formal test.'

# Human-readable test description; the count is the number of permutations that produced a statistic.
test_method = f'one-sided cell-label permutation test ({len(null_stats)} permutations)'
null_hypothesis = 'The median same-cell nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots is no smaller than under permuted RNAPIISer2-P cell labels.'
alternative_hypothesis = 'The observed median same-cell nearest-neighbor distance is smaller than the cell-label-permuted null distribution.'

# Broadcast the dataset-level test outputs onto every per-cell row, then persist the table.
result_table = per_cell.assign(
    observed_statistic_um=observed_statistic,
    null_median_um=null_median,
    effect_size_um_null_minus_observed=effect_size,
    p_value=p_value,
    test_method=test_method,
    hypothesis_test_status=hypothesis_test_status,
)
result_table.to_csv(result_path, index=False)

# Machine-readable summary of the run; the verification code later in this file reads
# its values through analysis_summary.get(...).
analysis_summary = {
    'idea_id': IDEA.idea_id,
    'parameter_name': 'polyA_to_RNAPIISer2P_nn_um',
    # parameter_value and observed_statistic intentionally carry the same number.
    'parameter_value': observed_statistic,
    'observed_statistic': observed_statistic,
    'effect_size': effect_size,
    'p_value': p_value,
    'test_method': test_method,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
    'hypothesis_test_status': hypothesis_test_status,
    # result_table has one row per cell, so these two counts describe cells, not spots.
    'n_selected_cells': int(result_table['cell_id'].nunique()),
    'n_rows': int(len(result_table)),
    # Totals after the per-cell cap, i.e. the spots that actually entered the distance computation.
    'n_polyA_high_sampled_total': int(result_table['n_polyA_high_sampled'].sum()),
    'n_RNAPIISer2P_high_sampled_total': int(result_table['n_RNAPIISer2P_high_sampled'].sum()),
    'n_permutations_requested': int(n_permutations),
    'n_null_permutations_finite': int(len(null_stats)),
    'max_high_spots_per_cell': int(max_high_spots_per_cell),
    'negative_control': 'RNAPIISer2-P high-spot cell labels were permuted across cells while preserving per-cell RNAPII counts.',
    'result_path': str(result_path),
    'figure_path': str(fig_path),
    'notes': [test_note, 'Top-quartile thresholds were computed within each cell; selected high spots were capped per marker/cell for bounded runtime.'],
}

# Statistical figure: null distribution with observed statistic, plus per-cell observed medians.
plt.rcParams.update({'figure.facecolor': 'white', 'axes.facecolor': 'white', 'font.size': 10})
fig, axes = plt.subplots(1, 2, figsize=(11, 4.2), constrained_layout=True)

# Left panel: histogram of permutation statistics with observed and null-median markers.
ax = axes[0]
if len(null_stats):
    ax.hist(null_stats, bins=25, color='#bdbdbd', edgecolor='white', label='Permuted RNAPIISer2-P cell labels')
ax.axvline(observed_statistic, color='#b2182b', linewidth=2.5, label='Observed')
ax.axvline(null_median, color='#2166ac', linewidth=2, linestyle='--', label='Null median')
ax.set_xlabel('Median per-cell nearest-neighbor distance (µm)')
ax.set_ylabel('Permutation count')
ax.set_title('Hypothesis-test evidence')
ax.legend(frameon=False, fontsize=8)
ax.text(0.03, 0.97, f'p = {p_value:.4f}\neffect = {effect_size:.3f} µm\nn = {len(result_table)} cells\n{test_method}',
        transform=ax.transAxes, va='top', ha='left', fontsize=8,
        bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='#cccccc', alpha=0.9))

# Right panel: one bar per cell, grouped by cell type, with the same two reference lines.
ax2 = axes[1]
order = result_table.sort_values(['cell_type', 'cell_id']).reset_index(drop=True)
colors = {'Granule': '#1b9e77', 'Bergmann': '#d95f02', 'Purkinje': '#7570b3'}
bar_colors = [colors.get(ct, '#666666') for ct in order['cell_type']]
ax2.bar(np.arange(len(order)), order['median_polyA_to_RNAPIISer2P_nn_um'], color=bar_colors, edgecolor='black', linewidth=0.4)
ax2.axhline(observed_statistic, color='#b2182b', linestyle='-', linewidth=1.5, label='Observed median')
ax2.axhline(null_median, color='#2166ac', linestyle='--', linewidth=1.5, label='Null median')
ax2.set_xticks(np.arange(len(order)))
ax2.set_xticklabels(order['cell_type'] + '\n' + order['cell_id'].astype(str), rotation=45, ha='right', fontsize=7)
ax2.set_ylabel('Nearest-neighbor distance (µm)')
ax2.set_title('Per-cell high polyA → high RNAPIISer2-P')
ax2.legend(frameon=False, fontsize=8)
for spine_ax in axes:
    spine_ax.spines['top'].set_visible(False)
    spine_ax.spines['right'].set_visible(False)
fig.suptitle('High polyA_RNA spots are compared with high RNAPIISer2-P neighborhoods', fontsize=12)
fig.savefig(fig_path, dpi=200, bbox_inches='tight')
# The backend is forced to non-interactive Agg, where plt.show() cannot display anything
# and only emits a UserWarning; close the figure instead and show the saved PNG.
plt.close(fig)
display(Image(filename=str(fig_path)))

# Show the per-cell table and the machine-readable summary.
display(result_table)
print(json.dumps(analysis_summary, indent=2))

<IPython.core.display.Image object>
   cell_id  ... hypothesis_test_status
0  1_0_116  ...                   pass
1   1_0_26  ...                   pass
2   1_0_34  ...                   pass
3   1_0_37  ...                   pass
4   1_0_42  ...                   pass
5   1_0_47  ...                   pass
6   1_0_61  ...                   pass
7   1_0_63  ...                   pass
8   1_0_69  ...                   pass

[9 rows x 15 columns]
{
  "idea_id": "rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af",
  "parameter_name": "polyA_to_RNAPIISer2P_nn_um",
  "parameter_value": 0.32063846671128693,
  "observed_statistic": 0.32063846671128693,
  "effect_size": 0.3910108795811784,
  "p_value": 0.001996007984031936,
  "test_method": "one-sided cell-label permutation test (500 permutations)",
  "null_hypothesis": "The median same-cell nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots is no smaller than under permuted RNAPIISer2-P cell labels.",
  "alternative_hypothesis": "The observed median same-cell nearest-neighbor distance is smaller than the cell-label-permuted null distribution.",
  "hypothesis_test_status": "pass",
  "n_selected_cells": 9,
  "n_rows": 9,
  "n_polyA_high_sampled_total": 3600,
  "n_RNAPIISer2P_high_sampled_total": 3600,
  "n_permutations_requested": 500,
  "n_null_permutations_finite": 500,
  "max_high_spots_per_cell": 400,
  "negative_control": "RNAPIISer2-P high-spot cell labels were permuted across cells while preserving per-cell RNAPII counts.",
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_result.csv",
  "figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png",
  "notes": [
    "Seeded one-sided cell-label permutation test completed.",
    "Top-quartile thresholds were computed within each cell; selected high spots were capped per marker/cell for bounded runtime."
  ]
}

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af.ipynb:191: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  {

# Path sanity check for saved artifacts.
from pathlib import Path
# Report where the artifacts ended up relative to the current working directory.
print('cwd', Path.cwd())
_out_dir = Path(RUN_OUTPUT_DIR)
print('RUN_OUTPUT_DIR', RUN_OUTPUT_DIR, 'exists', _out_dir.exists(), 'absolute', _out_dir.resolve())
_saved_figure = analysis_summary.get('figure_path')
print('analysis figure_path', _saved_figure)
print('figure exists at analysis path', Path(_saved_figure).exists())
_saved_result = analysis_summary.get('result_path')
print('result exists at analysis path', Path(_saved_result).exists())
# The same figure looked up by its repo-relative path and by bare file name.
_repo_relative_figure = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png')
print('workspace root expected fig exists', _repo_relative_figure.exists())
_bare_figure = Path('rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png')
print('basename fig exists', _bare_figure.exists())

cwd /Users/weizexu/Projects/U-Chrom
RUN_OUTPUT_DIR tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg exists True absolute /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg
analysis figure_path tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png
figure exists at analysis path True
result exists at analysis path True
workspace root expected fig exists True
basename fig exists False

# Start every idea-declared validation check as 'not_run'; free-text notes accumulate separately.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
notes = []
# The statistical test is always tracked, even when the idea did not list it under this name.
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    """List the keys of `checks` that equal `prefix` or extend it as `prefix:<suffix>`."""
    def _belongs(name):
        if name == prefix:
            return True
        return name.startswith(prefix + ':')

    return list(filter(_belongs, checks))

def _set_check(prefix, value):
    """Assign `value` to every check matching `prefix`, creating `prefix` itself if none match."""
    targets = _check_keys(prefix) or [prefix]
    for name in targets:
        checks[name] = value

def _check_status(prefix):
    """Collapse the statuses of all checks matching `prefix` into one value.

    Returns None when nothing matches, 'fail' if any match failed, 'pass' when
    every match passed, and otherwise the status of the first matching check.
    """
    statuses = [checks[name] for name in _check_keys(prefix)]
    if not statuses:
        return None
    if 'fail' in statuses:
        return 'fail'
    non_passing = [state for state in statuses if state != 'pass']
    return statuses[0] if non_passing else 'pass'

# Required fields pass only when the schema review ran and accepted the idea.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # With no dataset or no linked expression table there is nothing to align.
    aligned = True
    if cdata is not None and adata is not None:
        # Compare the full id sequences; list equality is False when the lengths differ,
        # so tables of different size no longer pass as aligned.
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
# Minimum cell count: prefer the summary's count, then the result table length,
# then the dataset's own cell table; the check passes with at least one cell.
if _check_keys('minimum_cell_count'):
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
# Minimum spot/trace count: measured as the number of result rows (at least one).
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
# Finite output: the reported parameter value must exist and be a finite number.
if _check_keys('finite_numeric_output'):
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
    # Re-read every test field from analysis_summary so the check judges what was reported.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    # Missing or non-numeric values become NaN so the finiteness tests below reject them.
    try:
        p_float = float(p_value)
    except Exception:
        p_float = np.nan
    try:
        stat_float = float(observed_statistic)
    except Exception:
        stat_float = np.nan
    try:
        effect_float = float(effect_size)
    except Exception:
        effect_float = np.nan
    # Pass requires non-blank method and hypotheses, a finite p-value in [0, 1], finite
    # statistic and effect size, and a test that was not flagged as lacking data.
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    # The result table must also carry the test columns.
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    stat_test_status = 'pass' if has_required_test else 'fail'
    _set_check('statistical_hypothesis_test', stat_test_status)
    # An idea may declare this check as 'statistical_hypothesis_test_with_p_value', which the
    # prefix matcher does not treat as the same check. Mirror the outcome so that entry is
    # not reported as 'not_run' after the test was in fact evaluated.
    if 'statistical_hypothesis_test_with_p_value' in checks:
        checks['statistical_hypothesis_test_with_p_value'] = stat_test_status
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
# Negative-control check: a keyword search over the test method text, the summary's key
# names, and the result table's column names.
# NOTE(review): key names count as evidence here, so a summary that merely has a
# 'negative_control' key passes regardless of what was actually run — confirm this is intended.
if _check_keys('negative_control_or_permutation'):
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    result_columns_text = ''
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = any(
        token in control_text
        for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    )
    # A missing control is reported as 'not_implemented' rather than 'fail'.
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )
# Any other control-type check that was never evaluated is marked 'not_implemented'.
for check in list(checks):
    if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
        checks[check] = 'not_implemented'

# Overall status: any failed mandatory check, or an empty result, fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [name for name in required_for_pass if _check_status(name) == 'fail']
notes.extend(f'{name} failed' for name in failed_required)
status = 'fail' if failed_required else 'pass'

# Row count comes from the summary when present, otherwise from the result table itself.
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final verification record: overall status, per-check outcomes, and the headline numbers
# copied from analysis_summary; verifier notes come first, followed by the analysis notes.
verification = {
    'idea_id': IDEA.idea_id,
    'status': status,
    'checks': checks,
    'parameter_value': analysis_summary.get('parameter_value'),
    'p_value': analysis_summary.get('p_value'),
    'test_method': analysis_summary.get('test_method'),
    'effect_size': analysis_summary.get('effect_size'),
    'result_path': analysis_summary.get('result_path'),
    'notes': notes + analysis_summary.get('notes', []),
}
print(json.dumps(verification, indent=2))

{
  "idea_id": "rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_with_p_value": "not_run",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": 0.32063846671128693,
  "p_value": 0.001996007984031936,
  "test_method": "one-sided cell-label permutation test (500 permutations)",
  "effect_size": 0.3910108795811784,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_result.csv",
  "notes": [
    "Seeded one-sided cell-label permutation test completed.",
    "Top-quartile thresholds were computed within each cell; selected high spots were capped per marker/cell for bounded runtime."
  ]
}

Auto-discovery idea: RNAPIISer2-P neighborhoods around polyA_RNA spots should survive cell-label negative controls

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and compact analysis plan

Statistical figure

Runner verification summary

Final interpretation

Final interpretation