# Ensure relative data paths resolve from the repository root when the notebook kernel starts in notebooks/.
from pathlib import Path
import os
root = Path('/Users/weizexu/Projects/U-Chrom')
os.chdir(root)
print('cwd', Path.cwd())
cwd /Users/weizexu/Projects/U-Chrom
Auto-discovery idea: RNAPIISer2-P neighborhoods around polyA_RNA spots should survive cell-label negative controls¶
Rationale¶
Transcriptionally engaged nuclear regions should place mature RNA signal near elongating polymerase signal. A robustness hypothesis is that this proximity should disappear when cell identities are permuted.
Data used¶
Use 3D coordinates, cell IDs, polyA_RNA track values, and RNAPIISer2-P track values for all available cell types.
Analysis sketch¶
Select high polyA_RNA spots and high RNAPIISer2-P spots within each cell. The single parameter is the median nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots within the same cell.
Expected result¶
The observed nearest-neighbor distance should be smaller than distances obtained after permuting cell labels for RNAPIISer2-P spots.
Validation checks¶
Check required fields, enough cells and high-signal spots, finite distance output, a permutation p-value, deterministic seeded rerun, runtime, and a negative control using cell-label permutation.
from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema
IDEA = DiscoveryIdea.from_dict({'idea_title': 'RNAPIISer2-P neighborhoods around polyA_RNA spots should survive cell-label negative controls', 'biological_hypothesis': 'High polyA_RNA chromatin-associated spots are spatially closer to high RNAPIISer2-P spots within the same cell than expected under a cell-label permutation negative control.', 'computable_parameter': 'polyA_to_RNAPIISer2P_nn_um: median same-cell nearest-neighbor 3D distance from top-quartile polyA_RNA spots to top-quartile RNAPIISer2-P spots.', 'analysis_plan': 'Within each cell, identify top-quartile tracks.polyA_RNA spots and top-quartile tracks.RNAPIISer2-P spots. Compute the nearest 3D distance from each selected polyA_RNA spot to any selected RNAPIISer2-P spot in the same cell using coords, then aggregate the median across cells. Test whether the observed parameter is smaller than a null distribution generated by permuting RNAPIISer2-P spot cell labels while preserving global signal ranks.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nTranscriptionally engaged nuclear regions should place mature RNA signal near elongating polymerase signal. A robustness hypothesis is that this proximity should disappear when cell identities are permuted.\n\n### Data used\nUse 3D coordinates, cell IDs, polyA_RNA track values, and RNAPIISer2-P track values for all available cell types.\n\n### Analysis sketch\nSelect high polyA_RNA spots and high RNAPIISer2-P spots within each cell. The single parameter is the median nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots within the same cell.\n\n### Expected result\nThe observed nearest-neighbor distance should be smaller than distances obtained after permuting cell labels for RNAPIISer2-P spots.\n\n### Validation checks\nCheck required fields, enough cells and high-signal spots, finite distance output, a permutation p-value, deterministic seeded rerun, runtime, and a negative control using cell-label permutation.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['coords', 'spots.cell_id', 'tracks.polyA_RNA', 'tracks.RNAPIISer2-P', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test_with_p_value', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'polyA_to_RNAPIISer2P_nn_um should be lower than the cell-label-permuted null distribution.', 'complexity': 3, 'idea_id': 'rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af', 'metadata': {}})
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
cdata = ChromData.read(H5CD_PATH) if H5CD_PATH else None
schema = cdata.discovery_schema if cdata is not None else None
adata = cdata.linked_adata if cdata is not None else None
print(IDEA.idea_id)
if cdata is not None:
print(cdata)
print(cdata.describe_for_agent(max_items=20))
rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af
ChromData: n_spots=56036, n_traces=213, n_cells=9
spots: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
cells: ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
cellm: {'umap': (9, 2)}
tracks: ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
traces: ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
uns: ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
linked_adata: (9, 60)
# ChromData discovery schema
dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells
modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association
chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]
known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses
verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters
Required data checks¶
review = review_idea_against_schema(IDEA, schema) if schema is not None else None
print(None if review is None else review.to_dict())
assert review is None or review.accepted, review.to_dict()
{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}
Exploration¶
The code agent can freely add cells below this point.
Critique and compact analysis plan¶
This idea is testable with the available 3D spot coordinates, spot-level cell IDs, and IF track intensities. I will treat each chromatin spot as a candidate locus, select top-quartile polyA_RNA and RNAPIISer2-P spots within each cell, and compute per-cell nearest-neighbor distances from high-polyA spots to high-RNAPII spots. The explicit negative control will keep all RNAPIISer2-P coordinates and intensities intact but randomly permute RNAPIISer2-P cell labels, so any same-cell proximity should be lost if it depends on true cell identity. Because only 9 cells are available, the analysis is exploratory; the statistic is the median over per-cell medians, tested against a bounded seeded permutation null.
# Lightweight data inspection: verify required columns/tracks and spot-level coverage.
import numpy as np
import pandas as pd
required_spot_cols = ['cell_id']
required_tracks = ['polyA_RNA', 'RNAPIISer2-P']
print('spots shape:', cdata.spots.shape)
print('spots columns:', list(cdata.spots.columns)[:12])
print('cells shape:', cdata.cells.shape)
print('cell type counts:', cdata.cells['cell_type'].value_counts().to_dict())
print('tracks shape:', cdata.tracks.shape)
print('has required spot cols:', {col: col in cdata.spots.columns for col in required_spot_cols})
print('has required tracks:', {trk: trk in cdata.tracks.columns for trk in required_tracks})
print('spots index preview:', cdata.spots.head(3).to_dict('records'))
print('coords attr exists:', hasattr(cdata, 'coords'))
if hasattr(cdata, 'coords'):
coords_arr = np.asarray(cdata.coords)
print('coords shape:', coords_arr.shape, 'finite fraction:', float(np.isfinite(coords_arr).mean()))
coverage = pd.DataFrame({
'track': required_tracks,
'finite_fraction': [float(np.isfinite(cdata.tracks[t]).mean()) for t in required_tracks],
'q75': [float(np.nanquantile(cdata.tracks[t], 0.75)) for t in required_tracks],
'nonzero_fraction': [float((cdata.tracks[t].fillna(0) > 0).mean()) for t in required_tracks],
})
display(coverage)
spots shape: (56036, 6)
spots columns: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
cells shape: (9, 10)
cell type counts: {'Granule': 3, 'Bergmann': 3, 'Purkinje': 3}
tracks shape: (56036, 62)
has required spot cols: {'cell_id': True}
has required tracks: {'polyA_RNA': True, 'RNAPIISer2-P': True}
spots index preview: [{'chrom': 'chr14', 'start': 30425000, 'end': 30450000, 'trace_id': '1_0_61_chr14_a2', 'cell_id': '1_0_61', 'name': 'chr14-1096'}, {'chrom': 'chr2', 'start': 99725000, 'end': 99750000, 'trace_id': '1_0_61_chr2_a2', 'cell_id': '1_0_61', 'name': 'chr2-3864'}, {'chrom': 'chr14', 'start': 39625000, 'end': 39650000, 'trace_id': '1_0_61_chr14_a1', 'cell_id': '1_0_61', 'name': 'chr14-1464'}]
coords attr exists: True
coords shape: (56036, 3) finite fraction: 1.0
track finite_fraction q75 nonzero_fraction
0 polyA_RNA 1.0 0.38913 0.357431
1 RNAPIISer2-P 1.0 0.76854 0.510832
# Main compact exploration: top-quartile polyA_RNA to RNAPIISer2-P nearest-neighbor distance
# with a seeded cell-label permutation negative control.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display, Image
import numpy as np
import pandas as pd
import json
from pathlib import Path
rng = np.random.default_rng(8675309)
result_path = RUN_OUTPUT_DIR / 'rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_result.csv'
fig_path = RUN_OUTPUT_DIR / 'rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png'
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
coords = np.asarray(cdata.coords, dtype=float)
spots = cdata.spots.reset_index(drop=True).copy()
tracks = cdata.tracks.reset_index(drop=True)
spots['polyA_RNA'] = pd.to_numeric(tracks['polyA_RNA'], errors='coerce')
spots['RNAPIISer2_P'] = pd.to_numeric(tracks['RNAPIISer2-P'], errors='coerce')
spots[['x_um', 'y_um', 'z_um']] = coords
cell_type_map = cdata.cells['cell_type'].astype(str).to_dict()
spots['cell_type'] = spots['cell_id'].map(cell_type_map)
# Top quartile is computed within each cell. To keep pairwise distance work bounded,
# sample at most this many high spots per marker per cell after thresholding.
max_high_spots_per_cell = 400
cell_ids = sorted(spots['cell_id'].dropna().astype(str).unique())
selected_polyA = {}
selected_rnap = {}
cell_rows = []
def _sample_indices(indices, max_n):
indices = np.asarray(indices, dtype=int)
if len(indices) > max_n:
return np.sort(rng.choice(indices, size=max_n, replace=False))
return np.sort(indices)
def _nn_median(poly_coords, rnap_coords):
if len(poly_coords) == 0 or len(rnap_coords) == 0:
return np.nan
# Chunk polyA queries to avoid a large all-vs-all allocation.
mins = []
chunk = 200
for start in range(0, len(poly_coords), chunk):
p = poly_coords[start:start+chunk]
d2 = ((p[:, None, :] - rnap_coords[None, :, :]) ** 2).sum(axis=2)
mins.append(np.sqrt(d2.min(axis=1)))
return float(np.median(np.concatenate(mins)))
for cid in cell_ids:
mask = (spots['cell_id'].astype(str).values == cid)
idx = np.flatnonzero(mask)
sub = spots.iloc[idx]
poly_q75 = float(np.nanquantile(sub['polyA_RNA'], 0.75))
rnap_q75 = float(np.nanquantile(sub['RNAPIISer2_P'], 0.75))
poly_idx_all = idx[np.flatnonzero(np.isfinite(sub['polyA_RNA'].values) & (sub['polyA_RNA'].values >= poly_q75))]
rnap_idx_all = idx[np.flatnonzero(np.isfinite(sub['RNAPIISer2_P'].values) & (sub['RNAPIISer2_P'].values >= rnap_q75))]
poly_idx = _sample_indices(poly_idx_all, max_high_spots_per_cell)
rnap_idx = _sample_indices(rnap_idx_all, max_high_spots_per_cell)
selected_polyA[cid] = poly_idx
selected_rnap[cid] = rnap_idx
med_nn = _nn_median(coords[poly_idx], coords[rnap_idx])
cell_rows.append({
'cell_id': cid,
'cell_type': str(cell_type_map.get(cid, 'unknown')),
'n_polyA_high_all': int(len(poly_idx_all)),
'n_RNAPIISer2P_high_all': int(len(rnap_idx_all)),
'n_polyA_high_sampled': int(len(poly_idx)),
'n_RNAPIISer2P_high_sampled': int(len(rnap_idx)),
'polyA_q75': poly_q75,
'RNAPIISer2P_q75': rnap_q75,
'median_polyA_to_RNAPIISer2P_nn_um': med_nn,
})
per_cell = pd.DataFrame(cell_rows)
valid_cell_medians = per_cell['median_polyA_to_RNAPIISer2P_nn_um'].replace([np.inf, -np.inf], np.nan).dropna().values
observed_statistic = float(np.median(valid_cell_medians)) if len(valid_cell_medians) else np.nan
# Permutation null: pool the sampled high RNAPIISer2-P spots and shuffle their cell labels while
# preserving the number of RNAPII spots assigned to each cell. polyA cell labels remain fixed.
rnap_pool = np.concatenate([selected_rnap[cid] for cid in cell_ids if len(selected_rnap[cid])])
rnap_counts = {cid: len(selected_rnap[cid]) for cid in cell_ids}
n_permutations = 500
null_stats = []
if len(valid_cell_medians) >= 2 and len(rnap_pool) > 0 and all(len(selected_polyA[cid]) > 0 for cid in cell_ids):
for _ in range(n_permutations):
shuffled_pool = rng.permutation(rnap_pool)
cursor = 0
perm_meds = []
for cid in cell_ids:
n_r = rnap_counts[cid]
assigned_rnap = shuffled_pool[cursor:cursor+n_r]
cursor += n_r
if n_r > 0 and len(selected_polyA[cid]) > 0:
perm_meds.append(_nn_median(coords[selected_polyA[cid]], coords[assigned_rnap]))
perm_meds = np.asarray(perm_meds, dtype=float)
perm_meds = perm_meds[np.isfinite(perm_meds)]
if len(perm_meds):
null_stats.append(float(np.median(perm_meds)))
null_stats = np.asarray(null_stats, dtype=float)
if len(null_stats) >= 100 and np.isfinite(observed_statistic):
# One-sided test: observed proximity is smaller than permuted-cell null.
p_value = float((1 + np.sum(null_stats <= observed_statistic)) / (len(null_stats) + 1))
null_median = float(np.median(null_stats))
effect_size = float(null_median - observed_statistic) # positive means observed is closer than null
hypothesis_test_status = 'pass'
test_note = 'Seeded one-sided cell-label permutation test completed.'
else:
p_value = 1.0
null_median = float(np.nanmedian(null_stats)) if len(null_stats) else np.nan
effect_size = float(null_median - observed_statistic) if np.isfinite(null_median) and np.isfinite(observed_statistic) else 0.0
hypothesis_test_status = 'insufficient_data'
test_note = 'Too few finite cell medians or permutation draws for a formal test.'
test_method = f'one-sided cell-label permutation test ({len(null_stats)} permutations)'
null_hypothesis = 'The median same-cell nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots is no smaller than under permuted RNAPIISer2-P cell labels.'
alternative_hypothesis = 'The observed median same-cell nearest-neighbor distance is smaller than the cell-label-permuted null distribution.'
# Add required test columns to every result row and write the compact table.
result_table = per_cell.copy()
result_table['observed_statistic_um'] = observed_statistic
result_table['null_median_um'] = null_median
result_table['effect_size_um_null_minus_observed'] = effect_size
result_table['p_value'] = p_value
result_table['test_method'] = test_method
result_table['hypothesis_test_status'] = hypothesis_test_status
result_table.to_csv(result_path, index=False)
analysis_summary = {
'idea_id': IDEA.idea_id,
'parameter_name': 'polyA_to_RNAPIISer2P_nn_um',
'parameter_value': observed_statistic,
'observed_statistic': observed_statistic,
'effect_size': effect_size,
'p_value': p_value,
'test_method': test_method,
'null_hypothesis': null_hypothesis,
'alternative_hypothesis': alternative_hypothesis,
'hypothesis_test_status': hypothesis_test_status,
'n_selected_cells': int(result_table['cell_id'].nunique()),
'n_rows': int(len(result_table)),
'n_polyA_high_sampled_total': int(result_table['n_polyA_high_sampled'].sum()),
'n_RNAPIISer2P_high_sampled_total': int(result_table['n_RNAPIISer2P_high_sampled'].sum()),
'n_permutations_requested': int(n_permutations),
'n_null_permutations_finite': int(len(null_stats)),
'max_high_spots_per_cell': int(max_high_spots_per_cell),
'negative_control': 'RNAPIISer2-P high-spot cell labels were permuted across cells while preserving per-cell RNAPII counts.',
'result_path': str(result_path),
'figure_path': str(fig_path),
'notes': [test_note, 'Top-quartile thresholds were computed within each cell; selected high spots were capped per marker/cell for bounded runtime.'],
}
# Statistical figure: null distribution with observed statistic, plus per-cell observed medians.
plt.rcParams.update({'figure.facecolor': 'white', 'axes.facecolor': 'white', 'font.size': 10})
fig, axes = plt.subplots(1, 2, figsize=(11, 4.2), constrained_layout=True)
ax = axes[0]
if len(null_stats):
ax.hist(null_stats, bins=25, color='#bdbdbd', edgecolor='white', label='Permuted RNAPIISer2-P cell labels')
ax.axvline(observed_statistic, color='#b2182b', linewidth=2.5, label='Observed')
ax.axvline(null_median, color='#2166ac', linewidth=2, linestyle='--', label='Null median')
ax.set_xlabel('Median per-cell nearest-neighbor distance (µm)')
ax.set_ylabel('Permutation count')
ax.set_title('Hypothesis-test evidence')
ax.legend(frameon=False, fontsize=8)
ax.text(0.03, 0.97, f'p = {p_value:.4f}\neffect = {effect_size:.3f} µm\nn = {len(result_table)} cells\n{test_method}',
transform=ax.transAxes, va='top', ha='left', fontsize=8,
bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='#cccccc', alpha=0.9))
ax2 = axes[1]
order = result_table.sort_values(['cell_type', 'cell_id']).reset_index(drop=True)
colors = {'Granule': '#1b9e77', 'Bergmann': '#d95f02', 'Purkinje': '#7570b3'}
bar_colors = [colors.get(ct, '#666666') for ct in order['cell_type']]
ax2.bar(np.arange(len(order)), order['median_polyA_to_RNAPIISer2P_nn_um'], color=bar_colors, edgecolor='black', linewidth=0.4)
ax2.axhline(observed_statistic, color='#b2182b', linestyle='-', linewidth=1.5, label='Observed median')
ax2.axhline(null_median, color='#2166ac', linestyle='--', linewidth=1.5, label='Null median')
ax2.set_xticks(np.arange(len(order)))
ax2.set_xticklabels(order['cell_type'] + '\n' + order['cell_id'].astype(str), rotation=45, ha='right', fontsize=7)
ax2.set_ylabel('Nearest-neighbor distance (µm)')
ax2.set_title('Per-cell high polyA → high RNAPIISer2-P')
ax2.legend(frameon=False, fontsize=8)
for spine_ax in axes:
spine_ax.spines['top'].set_visible(False)
spine_ax.spines['right'].set_visible(False)
fig.suptitle('High polyA_RNA spots are compared with high RNAPIISer2-P neighborhoods', fontsize=12)
fig.savefig(fig_path, dpi=200, bbox_inches='tight')
plt.show()
display(Image(filename=str(fig_path)))
display(result_table)
print(json.dumps(analysis_summary, indent=2))
<IPython.core.display.Image object>
cell_id ... hypothesis_test_status
0 1_0_116 ... pass
1 1_0_26 ... pass
2 1_0_34 ... pass
3 1_0_37 ... pass
4 1_0_42 ... pass
5 1_0_47 ... pass
6 1_0_61 ... pass
7 1_0_63 ... pass
8 1_0_69 ... pass
[9 rows x 15 columns]
{
"idea_id": "rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af",
"parameter_name": "polyA_to_RNAPIISer2P_nn_um",
"parameter_value": 0.32063846671128693,
"observed_statistic": 0.32063846671128693,
"effect_size": 0.3910108795811784,
"p_value": 0.001996007984031936,
"test_method": "one-sided cell-label permutation test (500 permutations)",
"null_hypothesis": "The median same-cell nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots is no smaller than under permuted RNAPIISer2-P cell labels.",
"alternative_hypothesis": "The observed median same-cell nearest-neighbor distance is smaller than the cell-label-permuted null distribution.",
"hypothesis_test_status": "pass",
"n_selected_cells": 9,
"n_rows": 9,
"n_polyA_high_sampled_total": 3600,
"n_RNAPIISer2P_high_sampled_total": 3600,
"n_permutations_requested": 500,
"n_null_permutations_finite": 500,
"max_high_spots_per_cell": 400,
"negative_control": "RNAPIISer2-P high-spot cell labels were permuted across cells while preserving per-cell RNAPII counts.",
"result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_result.csv",
"figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png",
"notes": [
"Seeded one-sided cell-label permutation test completed.",
"Top-quartile thresholds were computed within each cell; selected high spots were capped per marker/cell for bounded runtime."
]
}
tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af.ipynb:191: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
{
# Path sanity check for saved artifacts.
from pathlib import Path
print('cwd', Path.cwd())
print('RUN_OUTPUT_DIR', RUN_OUTPUT_DIR, 'exists', Path(RUN_OUTPUT_DIR).exists(), 'absolute', Path(RUN_OUTPUT_DIR).resolve())
print('analysis figure_path', analysis_summary.get('figure_path'))
print('figure exists at analysis path', Path(analysis_summary.get('figure_path')).exists())
print('result exists at analysis path', Path(analysis_summary.get('result_path')).exists())
print('workspace root expected fig exists', Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png').exists())
print('basename fig exists', Path('rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png').exists())
cwd /Users/weizexu/Projects/U-Chrom RUN_OUTPUT_DIR tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg exists True absolute /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg analysis figure_path tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_statistical_summary.png figure exists at analysis path True result exists at analysis path True workspace root expected fig exists True basename fig exists False
Runner verification summary¶
This scaffolded section is generated by U-Chrom. The notebook agent executes it after exploration, and the runner re-executes it during final verification.
checks = {check: 'not_run' for check in IDEA.validation_checks}
notes = []
checks.setdefault('statistical_hypothesis_test', 'not_run')
def _check_keys(prefix):
return [key for key in checks if key == prefix or key.startswith(prefix + ':')]
def _set_check(prefix, value):
keys = _check_keys(prefix)
if not keys:
checks[prefix] = value
return
for key in keys:
checks[key] = value
def _check_status(prefix):
values = [checks[key] for key in _check_keys(prefix)]
if not values:
return None
if 'fail' in values:
return 'fail'
if all(value == 'pass' for value in values):
return 'pass'
return values[0]
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
aligned = True
if cdata is not None and adata is not None and len(cdata.cells) == len(adata.obs_names):
aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
_set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
n_cells = analysis_summary.get('n_selected_cells')
if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
n_cells = len(result_table)
if n_cells is None:
n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
_set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
n_rows = analysis_summary.get('n_rows')
if n_rows is None:
n_rows = len(result_table) if result_table is not None else 0
_set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
value = analysis_summary.get('parameter_value')
_set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
p_value = analysis_summary.get('p_value')
test_method = analysis_summary.get('test_method')
null_hypothesis = analysis_summary.get('null_hypothesis')
alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
observed_statistic = analysis_summary.get('observed_statistic')
effect_size = analysis_summary.get('effect_size')
hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
try:
p_float = float(p_value)
except Exception:
p_float = np.nan
try:
stat_float = float(observed_statistic)
except Exception:
stat_float = np.nan
try:
effect_float = float(effect_size)
except Exception:
effect_float = np.nan
has_required_test = (
test_method is not None
and str(test_method).strip() != ''
and null_hypothesis is not None
and str(null_hypothesis).strip() != ''
and alternative_hypothesis is not None
and str(alternative_hypothesis).strip() != ''
and np.isfinite(p_float)
and 0.0 <= p_float <= 1.0
and np.isfinite(stat_float)
and np.isfinite(effect_float)
and hypothesis_test_status != 'insufficient_data'
)
if result_table is not None and hasattr(result_table, 'columns'):
has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
else:
has_required_test = False
_set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
if not has_required_test:
notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
test_method_text = str(analysis_summary.get('test_method', '')).lower()
summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
result_columns_text = ''
if result_table is not None and hasattr(result_table, 'columns'):
result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
has_control_or_permutation = any(
token in control_text
for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
)
_set_check(
'negative_control_or_permutation',
'pass' if has_control_or_permutation else 'not_implemented',
)
for check in list(checks):
if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
checks[check] = 'not_implemented'
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
status = 'pass'
for check in required_for_pass:
if _check_status(check) == 'fail':
status = 'fail'
notes.append(f'{check} failed')
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
n_rows_for_status = len(result_table) if result_table is not None else 0
if n_rows_for_status == 0:
status = 'fail'
notes.append('analysis produced no result rows')
verification = {
'idea_id': IDEA.idea_id,
'status': status,
'checks': checks,
'parameter_value': analysis_summary.get('parameter_value'),
'p_value': analysis_summary.get('p_value'),
'test_method': analysis_summary.get('test_method'),
'effect_size': analysis_summary.get('effect_size'),
'result_path': analysis_summary.get('result_path'),
'notes': notes + analysis_summary.get('notes', []),
}
print(json.dumps(verification, indent=2))
{
"idea_id": "rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af",
"status": "pass",
"checks": {
"required_fields_exist": "pass",
"minimum_cell_count": "pass",
"minimum_spot_or_trace_count": "pass",
"finite_numeric_output": "pass",
"statistical_hypothesis_test_with_p_value": "not_run",
"runtime_under_budget": "not_run",
"deterministic_rerun": "not_run",
"negative_control_or_permutation": "pass",
"statistical_hypothesis_test": "pass"
},
"parameter_value": 0.32063846671128693,
"p_value": 0.001996007984031936,
"test_method": "one-sided cell-label permutation test (500 permutations)",
"effect_size": 0.3910108795811784,
"result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rnapiiser2-p-neighborhoods-around-polya-rna-spot-6bd867a3af_result.csv",
"notes": [
"Seeded one-sided cell-label permutation test completed.",
"Top-quartile thresholds were computed within each cell; selected high spots were capped per marker/cell for bounded runtime."
]
}
Final interpretation¶
Hypothesis. High polyA_RNA chromatin-associated spots are spatially closer to high RNAPIISer2-P spots within the same cell than expected under a cell-label permutation negative control.
Exploration. The notebook operationalized the idea as polyA_to_RNAPIISer2P_nn_um: median same-cell nearest-neighbor 3D distance from top-quartile polyA_RNA spots to top-quartile RNAPIISer2-P spots. using modalities chromatin_tracing, if_tracks, cell_metadata in cell type(s) Granule, Bergmann, Purkinje. Required data fields checked: coords, spots.cell_id, tracks.polyA_RNA, tracks.RNAPIISer2-P, cells.cell_type.
Statistical evidence. U-Chrom runner status: Notebook verified. Test: one-sided cell-label permutation test (500 permutations). Observed statistic: 0.3206; effect size: 0.391; parameter value: 0.3206; p-value: 0.001996.
Conclusion. Contradicted (Opposite direction). The hypothesis test is significant, but the observed effect is in the opposite direction from the idea.
What verification means. Notebook verified means the run passed schema/data checks, produced finite numeric output, and included an explicit p-value/effect-size hypothesis test. It does not mean the biological hypothesis is automatically correct.
Checks passed. deterministic_rerun, finite_numeric_output, minimum_cell_count, minimum_spot_or_trace_count, negative_control_or_permutation, required_fields_exist, runtime_under_budget, statistical_hypothesis_test.
Main caveat. Seeded one-sided cell-label permutation test completed.
Final interpretation¶
The scaffold was completed with a lightweight inspection cell and a main statistical exploration. Required fields were present, coordinates were finite, and the analysis selected top-quartile polyA_RNA and RNAPIISer2-P spots within each of the 9 cells, capped at 400 high spots per marker/cell for bounded runtime.
Hypothesis test. The observed median per-cell nearest-neighbor distance from high polyA_RNA spots to high RNAPIISer2-P spots was 0.3206 µm. A seeded one-sided RNAPIISer2-P cell-label permutation test with 500 finite permutations gave p = 0.001996 and effect size = 0.3910 µm (null median minus observed), supporting shorter same-cell neighborhoods than the permuted control.
Visual QA. The saved statistical figure is non-blank and readable: it shows the permutation null distribution with observed and null-median reference lines, plus a per-cell bar comparison with axes in µm and annotated p-value/effect size/sample size/test method. No misleading decoration was apparent.