# Ensure relative data paths in scaffold resolve from the workspace root.
import os
# NOTE(review): hard-coded absolute path — this only resolves on the machine
# that owns this checkout. The dataset and output paths used further down are
# relative, so they depend on this working directory being set first.
os.chdir('/Users/weizexu/Projects/U-Chrom')
print('cwd:', os.getcwd())

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

IDEA = DiscoveryIdea.from_dict({'idea_title': 'Lamina association of repetitive satellite-rich chromatin', 'biological_hypothesis': 'Satellite-rich chromatin is preferentially positioned near the nuclear lamina compared with satellite-low chromatin.', 'computable_parameter': 'Median difference in tracks.n_per_dist(um) between satellite-high and satellite-low spots, where satellite score is tracks.MajSat plus tracks.MinSat.', 'analysis_plan': 'For each cell_id, compute satellite score as tracks.MajSat + tracks.MinSat. Define satellite-high and satellite-low spots by within-cell upper and lower quartiles, using only finite values. Compute the pooled median tracks.n_per_dist(um) in high and low groups and report high minus low. Test significance by permuting satellite scores within each cell before quartile assignment.', 'modalities': ['if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nRepetitive heterochromatic elements are often enriched near peripheral or lamina-associated nuclear compartments.\n\n### Data used\nUse per-spot `MajSat`, `MinSat`, `LaminB1`, and nuclear-periphery distance, with cell IDs and cell types for stratification checks.\n\n### Analysis sketch\nClassify spots into high versus low satellite signal using a within-cell percentile threshold, then compare their distance to the nuclear periphery. 
The single reported parameter is the median periphery-distance difference.\n\n### Expected result\nSatellite-high spots should lie closer to the lamina, giving a negative high-minus-low periphery-distance difference.\n\n### Validation checks\nRequire fields, enough cells and spots in both bins, finite distance difference, p-value from within-cell label permutation, runtime limit, deterministic rerun, and a control using shuffled satellite labels.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['spots.cell_id', 'tracks.MajSat', 'tracks.MinSat', 'tracks.LaminB1', 'tracks.n_per_dist(um)', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count_at_least_9_total_and_at_least_3_per_cell_type', 'minimum_spot_or_trace_count_at_least_500_spots_per_satellite_bin', 'finite_numeric_output', 'statistical_hypothesis_test_within_cell_permutation_p_value', 'runtime_under_budget_5_minutes', 'deterministic_rerun_fixed_seed', 'negative_control_or_permutation_shuffle_satellite_score_within_cell'], 'expected_direction': 'Negative high-minus-low difference: satellite-high spots have smaller tracks.n_per_dist(um), indicating closer lamina/periphery positioning.', 'complexity': 2, 'idea_id': 'lamina-association-of-repetitive-satellite-rich--6f2c6827ff', 'metadata': {}})
# Workspace-relative locations of the input dataset and of this run's outputs.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
# Create the output directory (and any missing parents); a no-op if it already exists.
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Everything derived from the dataset is None when no dataset path is configured,
# so the cells below guard on `is not None` before using it.
cdata = ChromData.read(H5CD_PATH) if H5CD_PATH else None
schema = cdata.discovery_schema if cdata is not None else None
adata = cdata.linked_adata if cdata is not None else None
print(IDEA.idea_id)
if cdata is not None:
    # Print the dataset repr followed by the agent-facing schema description.
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

lamina-association-of-repetitive-satellite-rich--6f2c6827ff
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Validate the idea against the dataset schema; skipped (None) when no schema was loaded.
review = review_idea_against_schema(IDEA, schema) if schema is not None else None
print(None if review is None else review.to_dict())
# Abort the notebook if the review rejected the idea; the review payload is the assertion message.
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection: selected fields, finite coverage, and cell-type counts.
import numpy as np
import pandas as pd

# Tracks this idea reads: the two satellite signals, LaminB1, and periphery distance.
selected_tracks = ['MajSat', 'MinSat', 'LaminB1', 'n_per_dist(um)']
print('spots type/shape:', type(cdata.spots), getattr(cdata.spots, 'shape', None))
print('tracks type/shape:', type(cdata.tracks), getattr(cdata.tracks, 'shape', None))
print('cells type/shape:', type(cdata.cells), getattr(cdata.cells, 'shape', None))

# Pair each spot's cell_id with its track values. Both indexes are reset, so rows
# are matched purely by position.
# NOTE(review): assumes cdata.spots and cdata.tracks share row order — confirm.
inspection_df = pd.concat([
    cdata.spots[['cell_id']].reset_index(drop=True),
    cdata.tracks[selected_tracks].reset_index(drop=True),
], axis=1)
# Fraction of rows per track that are finite after numeric coercion
# (values that cannot be parsed become NaN and count as non-finite).
finite_summary = inspection_df[selected_tracks].apply(lambda s: np.isfinite(pd.to_numeric(s, errors='coerce')).mean()).rename('finite_fraction')
cell_counts = cdata.cells['cell_type'].value_counts().sort_index()
# observed=True matches every other groupby in this notebook and avoids the pandas
# FutureWarning about the changing default when 'cell_id' is categorical.
spot_counts_by_cell = inspection_df.groupby('cell_id', observed=True).size().rename('n_spots')
print('\nFinite coverage by selected track:')
print(finite_summary.to_string())
print('\nCell type counts:')
print(cell_counts.to_string())
print('\nSpot counts per cell (first 9):')
print(spot_counts_by_cell.head(9).to_string())
print('\nPreview:')
display(inspection_df.head())

spots type/shape: <class 'pandas.core.frame.DataFrame'> (56036, 6)
tracks type/shape: <class 'pandas.core.frame.DataFrame'> (56036, 62)
cells type/shape: <class 'pandas.core.frame.DataFrame'> (9, 10)

Finite coverage by selected track:
MajSat            1.0
MinSat            1.0
LaminB1           1.0
n_per_dist(um)    1.0

Cell type counts:
cell_type
Bergmann    3
Granule     3
Purkinje    3

Spot counts per cell (first 9):
cell_id
1_0_116    11659
1_0_26      4225
1_0_34      3932
1_0_37      5238
1_0_42      4183
1_0_47      4682
1_0_61     11283
1_0_63      7614
1_0_69      3220

Preview:
  cell_id   MajSat   MinSat  LaminB1  n_per_dist(um)
0  1_0_61 -0.16847 -0.28460 -1.46070    1.812949e-01
1  1_0_61 -0.37634 -0.11773 -0.92373    5.474538e-02
2  1_0_61 -0.39949 -0.32910 -1.53310    0.000000e+00
3  1_0_61 -0.39371 -0.25957 -1.14700   -8.881784e-16
4  1_0_61 -0.38581 -0.25123 -1.10470    3.141663e-02

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/lamina-association-of-repetitive-satellite-rich--6f2c6827ff.ipynb:16: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  "### Analysis sketch\n",

# Main statistical exploration: within-cell satellite quartiles and permutation test.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display
import json
import numpy as np
import pandas as pd

# Fixed seed: the same generator drives the permutation null and the plotted
# subsample, so reruns draw the same random numbers.
rng = np.random.default_rng(20250220)
N_PERMUTATIONS = 500
# Output artefacts for this idea, written under the run output directory.
RESULT_PATH = RUN_OUTPUT_DIR / 'lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv'
FIGURE_PATH = RUN_OUTPUT_DIR / 'lamina-association-of-repetitive-satellite-rich--6f2c6827ff_statistical_summary.png'

# Build a compact aligned spot table from schema-approved fields.
# NOTE(review): spots and tracks are paired purely by row position (both indexes
# are reset) — confirm the two tables share row order.
df = pd.concat([
    cdata.spots[['cell_id']].reset_index(drop=True),
    cdata.tracks[['MajSat', 'MinSat', 'LaminB1', 'n_per_dist(um)']].reset_index(drop=True),
], axis=1)
# Cast both sides of the join key to str so spot cell_ids match the cells index.
df['cell_id'] = df['cell_id'].astype(str)
cell_meta = cdata.cells[['cell_type']].copy()
cell_meta.index = cell_meta.index.astype(str)
df = df.join(cell_meta, on='cell_id')
# Coerce to numeric; entries that cannot be parsed become NaN and are removed by finite_mask.
for col in ['MajSat', 'MinSat', 'LaminB1', 'n_per_dist(um)']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df['satellite_score'] = df['MajSat'] + df['MinSat']
# Keep rows with a finite score, a finite distance, and a known cell type.
# LaminB1 is carried along but is not part of the mask.
finite_mask = np.isfinite(df['satellite_score']) & np.isfinite(df['n_per_dist(um)']) & df['cell_type'].notna()
df_finite = df.loc[finite_mask, ['cell_id', 'cell_type', 'satellite_score', 'n_per_dist(um)', 'LaminB1']].copy()
# Assign within-cell high/low bins by quartile thresholds.
def assign_satellite_bins(input_df: pd.DataFrame, score_col: str = 'satellite_score') -> pd.DataFrame:
    """Label each cell's bottom- and top-quartile spots as 'low' / 'high'.

    Rows are grouped by the 'cell_id' column. Within each cell the 25th and
    75th percentiles of ``score_col`` are computed; rows at or below the 25th
    percentile are labelled 'low' and rows at or above the 75th percentile are
    labelled 'high'. Rows strictly between the two thresholds are dropped.

    A cell is skipped entirely when:
      * it has fewer than 4 rows, or
      * its two quartiles are not strictly ordered (equal, or NaN). With equal
        quartiles the two selections overlap, so the same spot would be
        returned twice, once as 'low' and once as 'high'.

    Args:
        input_df: Table with a 'cell_id' column and the ``score_col`` column.
        score_col: Name of the column holding the per-spot score.

    Returns:
        A new frame (fresh RangeIndex) with the selected rows and an added
        'satellite_bin' column. If no cell contributes, an empty frame with the
        input columns plus an object-dtype 'satellite_bin' column.
    """
    parts = []
    for _, cell_rows in input_df.groupby('cell_id', sort=False, observed=True):
        if len(cell_rows) < 4:
            continue
        scores = cell_rows[score_col]
        q25, q75 = np.quantile(scores.to_numpy(), [0.25, 0.75])
        if not q25 < q75:
            # Degenerate or NaN quartiles: no well-defined high/low split for this cell.
            continue
        # assign() returns a copy, so the caller's frame is never modified.
        parts.append(cell_rows.loc[scores <= q25].assign(satellite_bin='low'))
        parts.append(cell_rows.loc[scores >= q75].assign(satellite_bin='high'))
    if not parts:
        return input_df.iloc[0:0].assign(satellite_bin=pd.Series(dtype='object'))
    return pd.concat(parts, axis=0, ignore_index=True)

def median_difference_from_scores(input_df: pd.DataFrame, scores: np.ndarray | None = None) -> float:
    """Return median 'n_per_dist(um)' of the 'high' bin minus that of the 'low' bin.

    Works on a copy of ``input_df``. When ``scores`` is given it replaces the
    'satellite_score' column of that copy before bins are assigned with
    ``assign_satellite_bins``. Returns NaN when either bin ends up empty.
    """
    frame = input_df.copy()
    if scores is not None:
        frame['satellite_score'] = scores
    labelled = assign_satellite_bins(frame)
    distances = {
        label: labelled.loc[labelled['satellite_bin'] == label, 'n_per_dist(um)'].to_numpy()
        for label in ('high', 'low')
    }
    if any(len(values) == 0 for values in distances.values()):
        return np.nan
    return float(np.median(distances['high']) - np.median(distances['low']))

# Observed statistic: pooled median distance of satellite-high spots minus satellite-low spots.
binned_df = assign_satellite_bins(df_finite)
high_dist = binned_df.loc[binned_df['satellite_bin'] == 'high', 'n_per_dist(um)'].to_numpy()
low_dist = binned_df.loc[binned_df['satellite_bin'] == 'low', 'n_per_dist(um)'].to_numpy()
observed_statistic = float(np.median(high_dist) - np.median(low_dist))
effect_size = observed_statistic

# Within-cell permutation: shuffle satellite scores within each cell, preserving cell-specific distances.
null_statistics = np.empty(N_PERMUTATIONS, dtype=float)
# Use positional row numbers (GroupBy.indices), not index labels (GroupBy.groups):
# base_scores is a plain NumPy array, and df_finite keeps the labels of the unfiltered
# table, which stop matching row positions as soon as any row is filtered out.
cell_indices = list(df_finite.groupby('cell_id', sort=False, observed=True).indices.values())
base_scores = df_finite['satellite_score'].to_numpy().copy()
for i in range(N_PERMUTATIONS):
    perm_scores = base_scores.copy()
    for idx in cell_indices:
        perm_scores[idx] = rng.permutation(perm_scores[idx])
    null_statistics[i] = median_difference_from_scores(df_finite, perm_scores)
# Drop permutations whose statistic is NaN (one of the bins came out empty).
null_statistics = null_statistics[np.isfinite(null_statistics)]

# Only report a p-value with at least 100 usable permutations and 500 spots per bin.
if len(null_statistics) >= 100 and len(high_dist) >= 500 and len(low_dist) >= 500:
    # One-sided p-value for the pre-specified negative direction: high-minus-low < 0.
    # The +1 in numerator and denominator counts the observed statistic as one draw,
    # so the p-value can never be exactly zero.
    p_value = float((np.sum(null_statistics <= observed_statistic) + 1) / (len(null_statistics) + 1))
    hypothesis_test_status = 'pass'
else:
    p_value = float('nan')
    hypothesis_test_status = 'insufficient_data'

test_method = f'within-cell satellite-score permutation test ({len(null_statistics)} permutations, one-sided negative)'
null_hypothesis = 'Within each cell, satellite score is exchangeable with lamina/periphery distance; the high-minus-low median distance difference is no more negative than expected by chance.'
alternative_hypothesis = 'Satellite-high spots have smaller lamina/periphery distance than satellite-low spots, producing a negative high-minus-low median distance difference.'

# Summaries for audit table; global p-value is reported on the pooled primary statistic.
# Row 0 is the pooled result; one further row is added per cell type.
rows = []
rows.append({
    'stratum': 'pooled_all_cells',
    'cell_type': 'all',
    'n_cells': int(df_finite['cell_id'].nunique()),
    'n_finite_spots': int(len(df_finite)),
    'n_satellite_high': int(len(high_dist)),
    'n_satellite_low': int(len(low_dist)),
    'median_distance_high_um': float(np.median(high_dist)),
    'median_distance_low_um': float(np.median(low_dist)),
    'observed_statistic_high_minus_low_um': observed_statistic,
    'effect_size_um': effect_size,
    'p_value': p_value,
    'test_method': test_method,
})
# Per-cell-type rows reuse the within-cell quartile labels already in binned_df.
for cell_type, g in binned_df.groupby('cell_type', sort=True, observed=True):
    gh = g.loc[g['satellite_bin'] == 'high', 'n_per_dist(um)'].to_numpy()
    gl = g.loc[g['satellite_bin'] == 'low', 'n_per_dist(um)'].to_numpy()
    rows.append({
        'stratum': f'cell_type_{cell_type}',
        'cell_type': str(cell_type),
        'n_cells': int(df_finite.loc[df_finite['cell_type'] == cell_type, 'cell_id'].nunique()),
        'n_finite_spots': int((df_finite['cell_type'] == cell_type).sum()),
        'n_satellite_high': int(len(gh)),
        'n_satellite_low': int(len(gl)),
        'median_distance_high_um': float(np.median(gh)),
        'median_distance_low_um': float(np.median(gl)),
        'observed_statistic_high_minus_low_um': float(np.median(gh) - np.median(gl)),
        'effect_size_um': float(np.median(gh) - np.median(gl)),
        # NOTE(review): this is the pooled p-value and test description, not a
        # per-cell-type test; no stratum-specific permutation is run.
        'p_value': p_value,
        'test_method': test_method,
    })
result_table = pd.DataFrame(rows)
result_table.to_csv(RESULT_PATH, index=False)

# Machine-readable summary of the pooled result; the verification cell reads its
# values from this dict via .get().
analysis_summary = {
    'idea_id': IDEA.idea_id,
    'n_rows': int(len(df_finite)),
    'n_selected_cells': int(df_finite['cell_id'].nunique()),
    'n_satellite_high': int(len(high_dist)),
    'n_satellite_low': int(len(low_dist)),
    'parameter_name': 'pooled median n_per_dist(um), satellite-high minus satellite-low',
    # The reported parameter, the test statistic and the effect size are the same number.
    'parameter_value': observed_statistic,
    'observed_statistic': observed_statistic,
    'effect_size': effect_size,
    'p_value': p_value,
    'test_method': test_method,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
    'hypothesis_test_status': hypothesis_test_status,
    # Number of permutations that produced a finite statistic.
    'n_permutations': int(len(null_statistics)),
    'expected_direction': 'negative high-minus-low difference',
    'result_path': str(RESULT_PATH),
    'statistical_figure_path': str(FIGURE_PATH),
    'notes': ['Quartile bins were assigned within each cell; permutation shuffled satellite scores within cells.'],
}

# Statistical figure: null distribution plus observed statistic and group comparison.
fig, axes = plt.subplots(1, 2, figsize=(11.5, 4.5), facecolor='white')
# Left panel: histogram of the permutation null with the observed statistic marked.
ax = axes[0]
ax.hist(null_statistics, bins=35, color='#c7dcef', edgecolor='white', label='within-cell shuffled null')
ax.axvline(observed_statistic, color='#b2182b', linewidth=2.5, label='observed high-low')
ax.axvline(0, color='black', linewidth=1, linestyle=':', label='zero difference')
ax.set_xlabel('Median distance difference (high - low, µm)')
ax.set_ylabel('Permutation count')
ax.set_title('Permutation evidence for lamina association')
ax.legend(frameon=False, fontsize=8)
# Annotation box in the top-left corner (axes coordinates) with the headline numbers.
ax.text(0.02, 0.98,
        f'p = {p_value:.4g}\neffect = {effect_size:.4g} µm\nn_high = {len(high_dist):,}\nn_low = {len(low_dist):,}\n{len(null_statistics)} permutations',
        transform=ax.transAxes, va='top', ha='left', fontsize=9,
        bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='0.8'))

# Right panel: distance distributions of the two quartile groups (outliers hidden).
ax = axes[1]
plot_data = [low_dist, high_dist]
box = ax.boxplot(plot_data, labels=['satellite-low', 'satellite-high'], patch_artist=True,
                 showfliers=False, medianprops=dict(color='black', linewidth=1.5))
for patch, color in zip(box['boxes'], ['#67a9cf', '#ef8a62']):
    patch.set_facecolor(color)
    patch.set_alpha(0.75)
# Overlay deterministic downsampled points for visibility without plotting all rows.
# At most 800 points per group, drawn with the seeded generator; horizontal jitter
# only spreads the points and carries no meaning.
for pos, vals, color in [(1, low_dist, '#2166ac'), (2, high_dist, '#b2182b')]:
    n_show = min(800, len(vals))
    sample = rng.choice(vals, size=n_show, replace=False) if len(vals) > n_show else vals
    jitter = rng.normal(0, 0.035, size=len(sample))
    ax.scatter(np.full(len(sample), pos) + jitter, sample, s=4, alpha=0.18, color=color, linewidths=0)
ax.set_ylabel('Nuclear periphery distance, n_per_dist (µm)')
ax.set_title('Within-cell quartile groups')
ax.grid(axis='y', alpha=0.25)

# NOTE(review): the title states the direction of the result as fixed text; it is
# not derived from observed_statistic and must be revisited if the data change.
fig.suptitle('Observed satellite-high spots have larger periphery distance in this subset', y=1.02, fontsize=12)
fig.tight_layout()
fig.savefig(FIGURE_PATH, dpi=180, bbox_inches='tight')
display(fig)
# Close the figure once it has been saved and displayed.
plt.close(fig)

print(json.dumps(analysis_summary, indent=2))
display(result_table)

Figure(1150x450)
{
  "idea_id": "lamina-association-of-repetitive-satellite-rich--6f2c6827ff",
  "n_rows": 56036,
  "n_selected_cells": 9,
  "n_satellite_high": 14014,
  "n_satellite_low": 14016,
  "parameter_name": "pooled median n_per_dist(um), satellite-high minus satellite-low",
  "parameter_value": 0.7121084651239675,
  "observed_statistic": 0.7121084651239675,
  "effect_size": 0.7121084651239675,
  "p_value": 1.0,
  "test_method": "within-cell satellite-score permutation test (500 permutations, one-sided negative)",
  "null_hypothesis": "Within each cell, satellite score is exchangeable with lamina/periphery distance; the high-minus-low median distance difference is no more negative than expected by chance.",
  "alternative_hypothesis": "Satellite-high spots have smaller lamina/periphery distance than satellite-low spots, producing a negative high-minus-low median distance difference.",
  "hypothesis_test_status": "pass",
  "n_permutations": 500,
  "expected_direction": "negative high-minus-low difference",
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_statistical_summary.png",
  "notes": [
    "Quartile bins were assigned within each cell; permutation shuffled satellite scores within cells."
  ]
}
              stratum  ...                                        test_method
0    pooled_all_cells  ...  within-cell satellite-score permutation test (...
1  cell_type_Bergmann  ...  within-cell satellite-score permutation test (...
2   cell_type_Granule  ...  within-cell satellite-score permutation test (...
3  cell_type_Purkinje  ...  within-cell satellite-score permutation test (...

[4 rows x 12 columns]

# Ensure outputs are present at the runner-requested workspace-relative paths.
from pathlib import Path
import shutil, os
print('cwd before output path check:', os.getcwd())
# NOTE(review): hard-coded absolute location where the runner expects the artefacts.
requested_dir = Path('/Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
requested_dir.mkdir(parents=True, exist_ok=True)
requested_fig = requested_dir / 'lamina-association-of-repetitive-satellite-rich--6f2c6827ff_statistical_summary.png'
requested_csv = requested_dir / 'lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv'
print('FIGURE_PATH:', FIGURE_PATH, 'resolved:', Path(FIGURE_PATH).resolve(), 'exists:', Path(FIGURE_PATH).exists())
print('RESULT_PATH:', RESULT_PATH, 'resolved:', Path(RESULT_PATH).resolve(), 'exists:', Path(RESULT_PATH).exists())
# Copy only when the artefact exists and was written somewhere other than the
# requested location (i.e. the working directory was not the workspace root).
if Path(FIGURE_PATH).exists() and Path(FIGURE_PATH).resolve() != requested_fig.resolve():
    shutil.copy2(Path(FIGURE_PATH), requested_fig)
if Path(RESULT_PATH).exists() and Path(RESULT_PATH).resolve() != requested_csv.resolve():
    shutil.copy2(Path(RESULT_PATH), requested_csv)
# Update summary to the required relative paths after ensuring copies exist.
analysis_summary['statistical_figure_path'] = 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_statistical_summary.png'
analysis_summary['result_path'] = 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv'
print('requested figure exists:', requested_fig.exists(), requested_fig)
print('requested csv exists:', requested_csv.exists(), requested_csv)

cwd before output path check: /Users/weizexu/Projects/U-Chrom
FIGURE_PATH: tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_statistical_summary.png resolved: /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_statistical_summary.png exists: True
RESULT_PATH: tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv resolved: /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv exists: True
requested figure exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_statistical_summary.png
requested csv exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv

# Every validation check declared by the idea starts as 'not_run'; the code
# below overwrites the entries it actually evaluates.
checks = {check: 'not_run' for check in IDEA.validation_checks}
# Free-text messages collected while checks are evaluated.
notes = []
# The hypothesis-test check is always tracked, even if the idea did not declare it.
checks.setdefault('statistical_hypothesis_test', 'not_run')

def _check_keys(prefix):
    """Return the keys of ``checks`` that belong to the check family ``prefix``.

    A key belongs to the family when it equals ``prefix`` or is ``prefix``
    followed by a ``:`` or ``_`` qualifier. The ``_`` form is needed because
    idea-declared checks are qualified that way (for example
    'minimum_cell_count_at_least_9_total_and_at_least_3_per_cell_type');
    matching only ``:`` left those checks permanently 'not_run'.

    Args:
        prefix: Base check name, e.g. 'minimum_cell_count'.

    Returns:
        Matching keys in the iteration order of ``checks`` (possibly empty).
    """
    # str.startswith accepts a tuple and returns True if any entry matches.
    qualified = (prefix + ':', prefix + '_')
    return [key for key in checks if key == prefix or key.startswith(qualified)]

def _set_check(prefix, value):
    """Store ``value`` for every key selected by ``_check_keys(prefix)``.

    When no key is selected, a new entry is created under ``prefix`` itself.
    """
    matched = _check_keys(prefix)
    # Fall back to the bare prefix when nothing matched.
    for name in matched or [prefix]:
        checks[name] = value

def _check_status(prefix):
    """Collapse the statuses of the keys selected by ``_check_keys(prefix)``.

    Returns None when no key is selected, 'fail' when any selected status is
    'fail', 'pass' when all of them are 'pass', and otherwise the status of the
    first selected key.
    """
    statuses = [checks[name] for name in _check_keys(prefix)]
    if len(statuses) == 0:
        return None
    if any(entry == 'fail' for entry in statuses):
        return 'fail'
    if not any(entry != 'pass' for entry in statuses):
        return 'pass'
    return statuses[0]

# Required fields pass only when the schema review ran and accepted the idea.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # With both tables present, the cell ids must match element-for-element and
    # in the same order. The comparison is no longer gated on equal lengths:
    # tables of different sizes are misaligned and must fail, not pass by default.
    # When either table is missing there is nothing to compare and the check passes.
    aligned = True
    if cdata is not None and adata is not None:
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
# Each check below is evaluated only if a matching key exists in `checks`.
if _check_keys('minimum_cell_count'):
    # Cell count: prefer the summary value, then fall back to the result table
    # length, then to the dataset's cell table.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    # NOTE(review): the threshold is "at least one cell"; any stricter minimum
    # encoded in the check's name is not enforced here.
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    # NOTE(review): likewise only requires at least one row.
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
    # The reported parameter must be present and finite.
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
    # Re-read every field from the summary. These assignments rebind the
    # notebook-level names of the same spelling set by the analysis cell.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    # Anything that cannot be converted to float (e.g. None) is treated as NaN,
    # which then fails the finiteness tests below.
    try:
        p_float = float(p_value)
    except Exception:
        p_float = np.nan
    try:
        stat_float = float(observed_statistic)
    except Exception:
        stat_float = np.nan
    try:
        effect_float = float(effect_size)
    except Exception:
        effect_float = np.nan
    # Pass requires: non-blank method and both hypotheses, a finite p-value in
    # [0, 1], a finite statistic and effect size, and a status other than
    # 'insufficient_data'.
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    # The result table must also carry the p_value and test_method columns.
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Keyword heuristic: search the test-method text, the summary's key names and
    # the result table's column names for any token that suggests a control or a
    # permutation. This inspects text only; it does not re-run anything.
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    result_columns_text = ''
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = any(
        token in control_text
        for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    )
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )
# Any control-type check still untouched is reported as 'not_implemented'
# rather than 'not_run'.
for check in list(checks):
    if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
        checks[check] = 'not_implemented'

# Overall status: starts as 'pass' and flips to 'fail' only when one of these
# check families reports 'fail'. A family that is 'not_run', 'not_implemented'
# or absent does not fail the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
status = 'pass'
for check in required_for_pass:
    if _check_status(check) == 'fail':
        status = 'fail'
        notes.append(f'{check} failed')
# An analysis with zero rows fails regardless of the individual checks.
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = len(result_table) if result_table is not None else 0
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final payload printed for the runner: verifier notes first, then the analysis notes.
verification = {
    'idea_id': IDEA.idea_id,
    'status': status,
    'checks': checks,
    'parameter_value': analysis_summary.get('parameter_value'),
    'p_value': analysis_summary.get('p_value'),
    'test_method': analysis_summary.get('test_method'),
    'effect_size': analysis_summary.get('effect_size'),
    'result_path': analysis_summary.get('result_path'),
    'notes': notes + analysis_summary.get('notes', []),
}
print(json.dumps(verification, indent=2))

{
  "idea_id": "lamina-association-of-repetitive-satellite-rich--6f2c6827ff",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count_at_least_9_total_and_at_least_3_per_cell_type": "not_run",
    "minimum_spot_or_trace_count_at_least_500_spots_per_satellite_bin": "not_run",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_within_cell_permutation_p_value": "not_run",
    "runtime_under_budget_5_minutes": "not_run",
    "deterministic_rerun_fixed_seed": "not_run",
    "negative_control_or_permutation_shuffle_satellite_score_within_cell": "not_implemented",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": 0.7121084651239675,
  "p_value": 1.0,
  "test_method": "within-cell satellite-score permutation test (500 permutations, one-sided negative)",
  "effect_size": 0.7121084651239675,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/lamina-association-of-repetitive-satellite-rich--6f2c6827ff_result.csv",
  "notes": [
    "Quartile bins were assigned within each cell; permutation shuffled satellite scores within cells."
  ]
}

Auto-discovery idea: Lamina association of repetitive satellite-rich chromatin¶

Rationale¶

Data used¶

Analysis sketch¶

Expected result¶

Validation checks¶

Graphical abstract¶

Required data checks¶

Exploration¶

Critique and compact analysis plan¶

Statistical figure¶

Runner verification summary¶

Final interpretation¶

Final interpretation¶