# Ensure notebook-relative paths resolve from the U-Chrom workspace root.
# NOTE(review): the root is a hard-coded absolute path; os.chdir raises if the
# directory does not exist, so this cell presumably only works on the machine
# it was authored on -- confirm before reusing it elsewhere.
import os
os.chdir('/Users/weizexu/Projects/U-Chrom')  # process-wide change: affects every later relative path
print('cwd:', os.getcwd())  # echo the effective working directory for the run log

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Idea specification under test, rebuilt from its serialized dict form.
# The literal is kept on ONE physical line: a single-quoted string cannot span
# lines, and a stray line break inside 'idea_markdown' is a SyntaxError.
IDEA = DiscoveryIdea.from_dict({'idea_title': 'Xist-marked chrX inter-chromosomal isolation', 'biological_hypothesis': 'Xist_RNA-enriched chrX regions are more spatially isolated from autosomes than Xist_RNA-low chrX regions, indicating chromosome-specific inter-chromosomal exclusion.', 'computable_parameter': 'Xist_chrX_isolation_index = median nearest-neighbor distance from Xist_RNA-high chrX spots to non-chrX spots minus the matched median distance from Xist_RNA-low chrX spots to non-chrX spots, computed within cells and summarized across cells.', 'analysis_plan': 'Subset spots with spots.chrom == chrX. Within each cell, define Xist_RNA-high and Xist_RNA-low chrX spots by within-cell quantiles of tracks.Xist_RNA. For each chrX spot, compute the nearest 3D Euclidean distance to any non-chrX spot in the same cell. Compute the high-minus-low median difference per cell, then summarize across cells and cell types. Test whether the cell-level index is greater than zero using a one-sided signed-rank or permutation test; use random permutation of Xist_RNA values among chrX spots within each cell as a negative control.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nchrX may occupy a more insulated 3D neighborhood when local Xist_RNA signal is high, reflecting chromosome-specific organization.\n\n### Data used\nUse spot coordinates, chromosome labels, cell IDs, trace IDs, cell type metadata, and the Xist_RNA track across Granule, Bergmann, and Purkinje cells.\n\n### Analysis sketch\nFor chrX spots, compare nearest-neighbor distances to non-chrX spots for Xist_RNA-high versus Xist_RNA-low chrX bins within the same cell. Normalize by matched autosomal inter-chromosomal nearest-neighbor distances.\n\n### Expected result\nIf Xist-marked chrX is spatially insulated, the isolation index should be positive: Xist_RNA-high chrX spots should be farther from non-chrX chromatin than matched controls.\n\n### Validation checks\nConfirm required fields, adequate chrX spots per cell type, finite output, a paired/permutation p-value, runtime under budget, deterministic rerun, and chromosome-label permutation as a negative control.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['coords', 'spots.chrom', 'spots.cell_id', 'spots.trace_id', 'tracks.Xist_RNA', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'Positive Xist_chrX_isolation_index, indicating greater non-chrX separation for Xist_RNA-high chrX spots.', 'complexity': 3, 'idea_id': 'xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c', 'metadata': {}})
# Input dataset and per-run output directory, both relative to the working directory.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the dataset only when a path is configured; the schema and linked
# AnnData are derived from it and stay None when nothing was loaded.
cdata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
schema = None if cdata is None else cdata.discovery_schema
adata = None if cdata is None else cdata.linked_adata

print(IDEA.idea_id)
if cdata is not None:
    # Object summary first, then the agent-oriented schema description.
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Validate the idea against the dataset schema; without a schema there is nothing to review.
review = None
if schema is not None:
    review = review_idea_against_schema(IDEA, schema)
print(review.to_dict() if review is not None else None)
# Stop the notebook here when the review ran and rejected the idea.
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection for required fields and alignment assumptions.
import numpy as np
import pandas as pd

# Overall shapes, coordinate unit and cell-type composition.
print('spots shape:', cdata.spots.shape)
print('coords shape:', cdata.coords.shape, 'unit:', cdata.uns.get('xyz_unit', 'unknown'))
print('spots columns:', list(cdata.spots.columns)[:12])
print('cells shape:', cdata.cells.shape)
print('cell type counts:')
print(cdata.cells['cell_type'].value_counts().sort_index())

# Attach the Xist_RNA track to the identifying spot columns for a quick look.
xist = cdata.tracks['Xist_RNA']
spots_preview = cdata.spots[['chrom', 'cell_id', 'trace_id']].copy()
spots_preview['Xist_RNA'] = np.asarray(xist)
print('\nchrom counts (tail includes chrX):')
print(spots_preview['chrom'].value_counts().sort_index().tail())
print('\nchrX per-cell counts and finite Xist coverage:')
chrX_preview = spots_preview[spots_preview['chrom'].astype(str) == 'chrX']
# observed=False is passed explicitly: it keeps cells without any chrX spot in
# the report (as zero-count rows) and avoids the pandas FutureWarning about the
# changing default for grouping on categorical keys.
coverage = chrX_preview.groupby('cell_id', observed=False).agg(
    n_chrX=('chrom', 'size'),
    finite_xist=('Xist_RNA', lambda s: int(np.isfinite(s).sum())),
    median_xist=('Xist_RNA', 'median'),
)
print(coverage)
# Fraction of spots whose three coordinates are all finite.
print('\ncoordinate finite fraction:', float(np.isfinite(cdata.coords).all(axis=1).mean()))
print('\npreview:')
display(spots_preview.head())

spots shape: (56036, 6)
coords shape: (56036, 3) unit: um
spots columns: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
cells shape: (9, 10)
cell type counts:
cell_type
Bergmann    3
Granule     3
Purkinje    3
Name: count, dtype: int64

chrom counts (tail includes chrX):
chrom
chr6    3330
chr7    2822
chr8    2260
chr9    2946
chrX    1669
Name: count, dtype: int64

chrX per-cell counts and finite Xist coverage:
         n_chrX  finite_xist  median_xist
cell_id                                  
1_0_116     309          309      0.19422
1_0_26        0            0          NaN
1_0_34      101          101      0.21903
1_0_37        0            0          NaN
1_0_42      153          153      0.28604
1_0_47       35           35     -0.10609
1_0_61      496          496     -0.16485
1_0_63      390          390     -0.14861
1_0_69      185          185      0.27764

coordinate finite fraction: 1.0

preview:
   chrom cell_id         trace_id  Xist_RNA
0  chr14  1_0_61  1_0_61_chr14_a2  -0.19382
1   chr2  1_0_61   1_0_61_chr2_a2  -0.17822
2  chr14  1_0_61  1_0_61_chr14_a1  -0.16930
3   chr2  1_0_61   1_0_61_chr2_a2  -0.18713
4  chr14  1_0_61  1_0_61_chr14_a1  -0.18713

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c.ipynb:19: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  "text": [

# Main compact exploration: Xist-high chrX nearest-neighbor isolation from non-chrX chromatin.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import json
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display, Image

try:
    from scipy.spatial import cKDTree
except Exception:  # pragma: no cover - fallback for minimal environments
    cKDTree = None

# Fixed seed so the permutation null is reproducible across reruns.
rng = np.random.default_rng(20250308)

# Output artifacts for this idea live directly under the run directory.
result_path = RUN_OUTPUT_DIR / 'xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c_result.csv'
figure_path = RUN_OUTPUT_DIR / 'xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c_statistical_summary.png'

# Coordinates as a float array; the spot table gets a fresh 0..n-1 index so
# index labels can double as row positions into `coords`.
coords = np.asarray(cdata.coords, dtype=float)
spots = cdata.spots.reset_index(drop=True).copy()
spots = spots.assign(
    cell_id_str=spots['cell_id'].astype(str),
    chrom_str=spots['chrom'].astype(str),
    Xist_RNA=np.asarray(cdata.tracks['Xist_RNA'], dtype=float),
)

# String-keyed lookup from cell id to cell type.
cell_meta = cdata.cells.copy()
cell_meta.index = cell_meta.index.astype(str)
cell_type_map = dict(zip(cell_meta.index, cell_meta['cell_type'].astype(str)))

def nearest_distances_to_nonx(chr_coords, nonx_coords, chunk=1024):
    """Return each chrX point's Euclidean distance to its closest non-chrX point.

    Parameters
    ----------
    chr_coords : 2-D array of query coordinates, one row per chrX spot.
    nonx_coords : 2-D array of reference coordinates, one row per non-chrX spot.
    chunk : number of query rows processed per batch in the brute-force
        fallback (only used when ``cKDTree`` is unavailable).

    Returns
    -------
    Float array of length ``len(chr_coords)``; filled with NaN when either
    coordinate set is empty.
    """
    n_query = len(chr_coords)
    if n_query == 0 or len(nonx_coords) == 0:
        return np.full(n_query, np.nan)

    if cKDTree is None:
        # Brute force in batches so the pairwise difference tensor stays bounded.
        nearest = np.empty(n_query, dtype=float)
        for lo in range(0, n_query, chunk):
            hi = lo + chunk
            deltas = chr_coords[lo:hi, None, :] - nonx_coords[None, :, :]
            nearest[lo:hi] = np.sqrt((deltas ** 2).sum(axis=2).min(axis=1))
        return nearest

    distances, _ = cKDTree(nonx_coords).query(chr_coords, k=1, workers=1)
    return np.asarray(distances, dtype=float)

def high_low_index(xist_values, nn_values, q_low=0.25, q_high=0.75):
    """Compute the high-minus-low median distance index for one set of spots.

    Spots are binned by quantiles of ``xist_values``: the "low" bin holds
    values at or below the ``q_low`` quantile, the "high" bin values at or
    above the ``q_high`` quantile. The index is the median of ``nn_values``
    in the high bin minus the median in the low bin.

    Parameters
    ----------
    xist_values : array-like of marker values, one per spot.
    nn_values : array-like of distances, aligned with ``xist_values``.
    q_low, q_high : quantile levels that define the two bins.

    Returns
    -------
    tuple ``(index, high_median, low_median, n_high, n_low, high_threshold,
    low_threshold)``. The first three entries are NaN when the index cannot
    be computed: fewer than 8 finite pairs, constant marker values, a bin
    with fewer than 2 spots, or bins that share spots.
    """
    # Accept lists/Series as well as arrays; boolean masking needs ndarrays.
    xist_values = np.asarray(xist_values, dtype=float)
    nn_values = np.asarray(nn_values, dtype=float)

    # Only spots where both the marker and the distance are finite take part.
    finite = np.isfinite(xist_values) & np.isfinite(nn_values)
    x = xist_values[finite]
    d = nn_values[finite]
    if len(x) < 8 or x.max() == x.min():
        return np.nan, np.nan, np.nan, 0, 0, np.nan, np.nan

    low_thr = float(np.quantile(x, q_low))
    high_thr = float(np.quantile(x, q_high))
    low_mask = x <= low_thr
    high_mask = x >= high_thr
    n_low = int(low_mask.sum())
    n_high = int(high_mask.sum())

    # Heavy ties can make both thresholds coincide, which would put the same
    # spots into both bins; such a split carries no high-vs-low contrast.
    bins_overlap = bool((low_mask & high_mask).any())
    if n_low < 2 or n_high < 2 or bins_overlap:
        return np.nan, np.nan, np.nan, n_high, n_low, high_thr, low_thr

    high_med = float(np.median(d[high_mask]))
    low_med = float(np.median(d[low_mask]))
    return high_med - low_med, high_med, low_med, n_high, n_low, high_thr, low_thr

# Per-cell pass: nearest non-chrX distance for every chrX spot, then the
# high-minus-low Xist quartile contrast of those distances.
cell_records = []
per_cell_arrays = {}
for cell_id, cell_spots in spots.groupby('cell_id_str', sort=True):
    # Drop spots of this cell that have any non-finite coordinate.
    member_idx = cell_spots.index.to_numpy()
    member_idx = member_idx[np.isfinite(coords[member_idx]).all(axis=1)]
    members = spots.iloc[member_idx]

    # Split the remaining spots into chrX and everything else.
    is_chrx = members['chrom_str'].eq('chrX').to_numpy()
    member_labels = members.index.to_numpy()
    chr_idx = member_labels[is_chrx]
    nonx_idx = member_labels[~is_chrx]

    nn = nearest_distances_to_nonx(coords[chr_idx], coords[nonx_idx])
    xist_chr = spots.loc[chr_idx, 'Xist_RNA'].to_numpy(dtype=float)
    iso, high_med, low_med, n_high, n_low, high_thr, low_thr = high_low_index(xist_chr, nn)

    n_usable = int((np.isfinite(xist_chr) & np.isfinite(nn)).sum())
    record = {
        'cell_id': cell_id,
        'cell_type': cell_type_map.get(cell_id, 'unknown'),
        'n_chrX_spots': int(len(chr_idx)),
        'n_non_chrX_spots': int(len(nonx_idx)),
        'n_finite_chrX': n_usable,
        'n_xist_high_chrX': n_high,
        'n_xist_low_chrX': n_low,
        'xist_high_threshold': high_thr,
        'xist_low_threshold': low_thr,
        'median_nn_nonchrX_xist_high_um': high_med,
        'median_nn_nonchrX_xist_low_um': low_med,
        'isolation_index_um': iso,
    }
    cell_records.append(record)

    # Keep the raw arrays only for cells that produced a usable index; the
    # permutation test reshuffles exactly these.
    if np.isfinite(iso):
        per_cell_arrays[cell_id] = (xist_chr.copy(), nn.copy())

# Cohort-level observed statistics over cells with a finite index.
cell_table = pd.DataFrame(cell_records)
eligible = cell_table.loc[np.isfinite(cell_table['isolation_index_um'])].copy()
n_selected_cells = int(len(eligible))
if n_selected_cells:
    observed_statistic = float(eligible['isolation_index_um'].mean())
    observed_median = float(eligible['isolation_index_um'].median())
else:
    observed_statistic = float('nan')
    observed_median = float('nan')

# Permutation null: shuffle Xist values among chrX spots within each eligible
# cell and recompute the mean cell-level index.
n_permutations = 500
null_distribution = np.full(n_permutations, np.nan, dtype=float)
if n_selected_cells < 2:
    # Too few cells for a cohort-level test; report neutral placeholders.
    valid_null = np.array([], dtype=float)
    p_value = 1.0
    null_center = 0.0
    effect_size = float(observed_statistic) if np.isfinite(observed_statistic) else 0.0
    hypothesis_test_status = 'insufficient_data'
else:
    eligible_ids = eligible['cell_id'].tolist()
    for b in range(n_permutations):
        permuted_cell_indices = []
        for cell_id in eligible_ids:
            xist_chr, nn = per_cell_arrays[cell_id]
            perm_iso = high_low_index(rng.permutation(xist_chr), nn)[0]
            if np.isfinite(perm_iso):
                permuted_cell_indices.append(perm_iso)
        # A draw counts only when every eligible cell yielded a finite index.
        if len(permuted_cell_indices) == n_selected_cells:
            null_distribution[b] = float(np.mean(permuted_cell_indices))

    valid_null = null_distribution[np.isfinite(null_distribution)]
    n_valid = len(valid_null)
    if n_valid:
        # One-sided (greater) p-value with the +1 correction.
        p_value = float((1 + np.sum(valid_null >= observed_statistic)) / (n_valid + 1))
        null_center = float(np.median(valid_null))
    else:
        p_value = 1.0
        null_center = 0.0
    effect_size = float(observed_statistic - null_center)
    hypothesis_test_status = 'pass' if n_valid >= 100 else 'insufficient_data'

# Textual description of the test; the count reflects valid permutations only.
test_method = f'within-cell Xist-label randomization test, one-sided greater, {len(valid_null)} permutations'
null_hypothesis = 'Within each cell, Xist_RNA values on chrX spots are exchangeable with respect to nearest non-chrX distance; the mean high-minus-low isolation index is no greater than expected by random labels.'
alternative_hypothesis = 'Xist_RNA-high chrX spots have larger nearest-neighbor distances to non-chrX spots than Xist_RNA-low chrX spots, yielding a positive mean isolation index.'

# Broadcast the cohort-level statistics onto every eligible cell row and persist the table.
result_table = eligible.assign(
    observed_statistic=observed_statistic,
    effect_size=effect_size,
    p_value=p_value,
    test_method=test_method,
    hypothesis_test_status=hypothesis_test_status,
)
result_table.to_csv(result_path, index=False)

# Machine-readable summary consumed by the verification cell; key order is
# kept stable because the dict is printed as JSON.
analysis_summary = dict(
    idea_id=IDEA.idea_id,
    parameter_name='Xist_chrX_isolation_index',
    parameter_value=observed_statistic,
    observed_statistic=observed_statistic,
    observed_median_cell_index_um=observed_median,
    effect_size=effect_size,
    p_value=p_value,
    test_method=test_method,
    null_hypothesis=null_hypothesis,
    alternative_hypothesis=alternative_hypothesis,
    hypothesis_test_status=hypothesis_test_status,
    n_selected_cells=n_selected_cells,
    n_rows=int(len(result_table)),
    n_permutations_requested=n_permutations,
    n_valid_null_permutations=int(len(valid_null)),
    result_path=str(result_path),
    figure_path=str(figure_path),
    xyz_unit=cdata.uns.get('xyz_unit', 'um'),
    notes=[
        'Nearest non-chrX distance computed within each cell; Xist high/low are within-cell chrX quartile bins.',
        'Permutation negative control shuffles Xist_RNA labels only among chrX spots within each cell.'
    ],
)

# Statistical figure: group comparison by cell plus observed statistic against the permutation null.
fig, axes = plt.subplots(1, 2, figsize=(11.5, 4.5), facecolor='white')

# Left panel: one line per eligible cell joining its low- and high-quartile medians.
ax = axes[0]
if len(result_table):
    x_positions = {'low': 0, 'high': 1}
    for _, row in result_table.iterrows():
        ax.plot([x_positions['low'], x_positions['high']],
                [row['median_nn_nonchrX_xist_low_um'], row['median_nn_nonchrX_xist_high_um']],
                marker='o', linewidth=1.4, alpha=0.8, label=row['cell_type'])
    # de-duplicate legend labels (several cells share a cell type)
    handles, labels = ax.get_legend_handles_labels()
    keep = dict(zip(labels, handles))
    ax.legend(keep.values(), keep.keys(), title='Cell type', fontsize=8, title_fontsize=8, frameon=False)
ax.set_xticks([0, 1], ['Xist_RNA-low\nchrX quartile', 'Xist_RNA-high\nchrX quartile'])
ax.set_ylabel('Median nearest distance to non-chrX spots (um)')
ax.set_title('Paired per-cell nearest-neighbor distances')
ax.grid(axis='y', alpha=0.25)

# Right panel: permutation null histogram with the observed mean and null median marked.
ax = axes[1]
if len(valid_null):
    ax.hist(valid_null, bins=28, color='#b8c6d9', edgecolor='white', label='Shuffled Xist labels\nwithin cells')
else:
    ax.text(0.5, 0.5, 'No valid null permutations', ha='center', va='center', transform=ax.transAxes)
ax.axvline(observed_statistic, color='#b22222', linewidth=2.5, label='Observed mean index')
ax.axvline(null_center, color='#2c7fb8', linewidth=1.8, linestyle='--', label='Null median')
ax.set_xlabel('Mean cell-level isolation index (high - low, um)')
ax.set_ylabel('Permutation count')
ax.set_title('Hypothesis-test evidence')
annotation = (f'p = {p_value:.4f}\n'
              f'effect = {effect_size:.4f} um\n'
              f'n cells = {n_selected_cells}\n'
              f'{len(valid_null)} permutations')
ax.text(0.98, 0.96, annotation, transform=ax.transAxes, ha='right', va='top',
        fontsize=9, bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='0.7', alpha=0.95))
ax.legend(fontsize=8, frameon=False, loc='upper left')
ax.grid(axis='y', alpha=0.25)
fig.suptitle('Xist-marked chrX inter-chromosomal isolation audit', fontsize=13)
fig.tight_layout(rect=[0, 0, 1, 0.94])
fig.savefig(figure_path, dpi=180, bbox_inches='tight')
# The Agg backend is non-interactive, so plt.show() would only emit a
# UserWarning; the saved PNG is rendered into the notebook instead.
display(Image(filename=str(figure_path)))
# Release the figure now that it has been written and displayed.
plt.close(fig)

print(json.dumps(analysis_summary, indent=2))
display(result_table)

<IPython.core.display.Image object>
{
  "idea_id": "xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c",
  "parameter_name": "Xist_chrX_isolation_index",
  "parameter_value": 0.028422877107321668,
  "observed_statistic": 0.028422877107321668,
  "observed_median_cell_index_um": 0.01819379887524314,
  "effect_size": 0.028334359733265868,
  "p_value": 0.011976047904191617,
  "test_method": "within-cell Xist-label randomization test, one-sided greater, 500 permutations",
  "null_hypothesis": "Within each cell, Xist_RNA values on chrX spots are exchangeable with respect to nearest non-chrX distance; the mean high-minus-low isolation index is no greater than expected by random labels.",
  "alternative_hypothesis": "Xist_RNA-high chrX spots have larger nearest-neighbor distances to non-chrX spots than Xist_RNA-low chrX spots, yielding a positive mean isolation index.",
  "hypothesis_test_status": "pass",
  "n_selected_cells": 7,
  "n_rows": 7,
  "n_permutations_requested": 500,
  "n_valid_null_permutations": 500,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c_result.csv",
  "figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c_statistical_summary.png",
  "xyz_unit": "um",
  "notes": [
    "Nearest non-chrX distance computed within each cell; Xist high/low are within-cell chrX quartile bins.",
    "Permutation negative control shuffles Xist_RNA labels only among chrX spots within each cell."
  ]
}
   cell_id  ... hypothesis_test_status
0  1_0_116  ...                   pass
2   1_0_34  ...                   pass
4   1_0_42  ...                   pass
5   1_0_47  ...                   pass
6   1_0_61  ...                   pass
7   1_0_63  ...                   pass
8   1_0_69  ...                   pass

[7 rows x 17 columns]

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c.ipynb:200: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  "cell_type": "code",

# Every validation check requested by the idea starts as 'not_run'; the
# hypothesis-test check is tracked even if the idea did not request it.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
checks.setdefault('statistical_hypothesis_test', 'not_run')
notes = []

def _check_keys(prefix):
    """List the keys of ``checks`` equal to ``prefix`` or namespaced under ``prefix:``."""
    scoped = prefix + ':'
    matches = []
    for key in checks:
        if key == prefix or key.startswith(scoped):
            matches.append(key)
    return matches

def _set_check(prefix, value):
    """Assign ``value`` to every check matching ``prefix``.

    When no existing check matches, a new entry keyed by ``prefix`` itself
    is created in ``checks``.
    """
    targets = _check_keys(prefix) or [prefix]
    for key in targets:
        checks[key] = value

def _check_status(prefix):
    """Summarise the status of all checks matching ``prefix``.

    Returns
    -------
    ``None`` when no check matches; ``'fail'`` if any matching check failed;
    ``'pass'`` only if every matching check passed; otherwise the first
    matching status that is not ``'pass'`` (e.g. ``'not_run'``).
    """
    values = [checks[key] for key in _check_keys(prefix)]
    if not values:
        return None
    if 'fail' in values:
        return 'fail'
    # Report the first status that is not 'pass'. Returning values[0] here
    # could answer 'pass' for a mixed group such as ['pass', 'not_run'].
    return next((value for value in values if value != 'pass'), 'pass')

# An accepted schema review is what establishes that the required fields exist.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # Without a dataset or a linked AnnData there is nothing to align, so the
    # check passes vacuously.
    aligned = True
    if cdata is not None and adata is not None:
        # Compare the full ordered ID lists. Tables of different length are
        # misaligned and must fail rather than skip the comparison.
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Prefer the count reported by the analysis; fall back to the result
    # table, then to the dataset itself.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None:
        if 'cell_type' in getattr(result_table, 'columns', []):
            n_cells = len(result_table)
        elif cdata is not None and getattr(cdata, 'n_cells', 0):
            n_cells = len(cdata.cells)
        else:
            n_cells = 0
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')

if _check_keys('minimum_spot_or_trace_count'):
    # Row count of the result table stands in for the spot/trace count.
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = 0 if result_table is None else len(result_table)
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')

if _check_keys('finite_numeric_output'):
    # The headline parameter must be present and finite.
    value = analysis_summary.get('parameter_value')
    value_is_finite = value is not None and np.isfinite(value)
    _set_check('finite_numeric_output', 'pass' if value_is_finite else 'fail')
if _check_keys('statistical_hypothesis_test'):
    # Re-read every reported field from the summary so the check judges what was recorded.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')

    # Coerce the numeric fields; anything that cannot be converted becomes NaN
    # and therefore fails the finiteness requirements below.
    numeric = []
    for raw in (p_value, observed_statistic, effect_size):
        try:
            numeric.append(float(raw))
        except Exception:
            numeric.append(np.nan)
    p_float, stat_float, effect_float = numeric

    # All descriptive fields must be non-empty, all numbers finite, the
    # p-value a probability, and the analysis must not have flagged itself
    # as under-powered.
    text_fields = (test_method, null_hypothesis, alternative_hypothesis)
    has_required_test = (
        all(field is not None and str(field).strip() != '' for field in text_fields)
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )

    # The per-row table must carry the test columns as well.
    if result_table is None or not hasattr(result_table, 'columns'):
        has_required_test = False
    else:
        has_required_test = has_required_test and all(
            column in result_table.columns for column in ('p_value', 'test_method')
        )

    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Keyword scan over the test description, the summary keys and the result
    # columns for any sign of a permutation or control procedure.
    control_tokens = ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(map(str.lower, map(str, analysis_summary.keys())))
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(map(str.lower, map(str, result_table.columns)))
    else:
        result_columns_text = ''
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = False
    for token in control_tokens:
        if token in control_text:
            has_control_or_permutation = True
            break
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )

# Control-type checks that never ran are reported as not implemented.
for check, state in list(checks.items()):
    is_control_check = 'negative_control' in check or check.endswith('_control')
    if state == 'not_run' and is_control_check:
        checks[check] = 'not_implemented'

# Overall status: any failed mandatory check, or an empty result, fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [check for check in required_for_pass if _check_status(check) == 'fail']
notes.extend(f'{check} failed' for check in failed_required)
status = 'fail' if failed_required else 'pass'

n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final report: verification notes first, followed by the analysis notes.
verification = dict(
    idea_id=IDEA.idea_id,
    status=status,
    checks=checks,
    parameter_value=analysis_summary.get('parameter_value'),
    p_value=analysis_summary.get('p_value'),
    test_method=analysis_summary.get('test_method'),
    effect_size=analysis_summary.get('effect_size'),
    result_path=analysis_summary.get('result_path'),
    notes=notes + analysis_summary.get('notes', []),
)
print(json.dumps(verification, indent=2))

{
  "idea_id": "xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test": "pass",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass"
  },
  "parameter_value": 0.028422877107321668,
  "p_value": 0.011976047904191617,
  "test_method": "within-cell Xist-label randomization test, one-sided greater, 500 permutations",
  "effect_size": 0.028334359733265868,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/xist-marked-chrx-inter-chromosomal-isolation-3bd44d703c_result.csv",
  "notes": [
    "Nearest non-chrX distance computed within each cell; Xist high/low are within-cell chrX quartile bins.",
    "Permutation negative control shuffles Xist_RNA labels only among chrX spots within each cell."
  ]
}

Auto-discovery idea: Xist-marked chrX inter-chromosomal isolation

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and compact analysis plan

Statistical figure

Runner verification summary

Final interpretation

Final interpretation