import os
# Switch the process working directory to the project checkout so that the
# relative 'tmp/...' paths used by later cells resolve against it.
# NOTE(review): this is an absolute, machine-specific path; the cell cannot
# succeed on a host where this directory does not exist.
os.chdir('/Users/weizexu/Projects/U-Chrom')
print('cwd:', os.getcwd())

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Idea under test, rebuilt from its serialized dict form. The payload carries
# the hypothesis text, the required ChromData fields, and the list of
# validation checks that the verification cell iterates over.
# NOTE(review): 'analysis_plan' allows either a signed-rank test or a
# permutation test; the analysis cell in this notebook implements only the
# within-cell permutation variant.
IDEA = DiscoveryIdea.from_dict({'idea_title': 'Chromosome-specific peripheral positioning by LaminB1', 'biological_hypothesis': 'Chromosomes with stronger LaminB1-associated signal are positioned more peripherally, revealing chromosome-specific lamina-linked nuclear organization.', 'computable_parameter': 'LaminB1_chromosome_peripheral_slope = within-cell regression slope relating chromosome-level median tracks.LaminB1 to chromosome-level median tracks.n_per_dist(um), summarized across cells.', 'analysis_plan': 'Within each cell, aggregate spots by spots.chrom and compute chromosome-level median tracks.LaminB1 and median tracks.n_per_dist(um). Fit a simple robust or ordinary least-squares regression of median n_per_dist(um) on median LaminB1 across chromosomes per cell, then summarize slopes across cells. Test whether slopes differ from zero using a two-sided signed-rank test or a chromosome-label permutation test; use permutation of LaminB1 values across chromosomes within each cell as a negative control.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nDifferent chromosomes may vary in their association with the nuclear lamina, producing chromosome-specific radial organization.\n\n### Data used\nUse coordinates, chromosome labels, cell IDs, cell type metadata, LaminB1 signal, and nuclear radial/peripheral distance tracks.\n\n### Analysis sketch\nFor each chromosome in each cell, summarize LaminB1 signal and radial/peripheral position, then ask whether chromosomes with stronger LaminB1 signal are more peripheral.\n\n### Expected result\nIf lamina association drives chromosome-specific positioning, chromosomes with higher LaminB1 should show stronger peripheral positioning.\n\n### Validation checks\nRequire field existence, enough chromosomes and cells, finite slope, regression or permutation p-value, runtime under budget, deterministic rerun, and chromosome-label or LaminB1 permutation as a negative control.', 
'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['spots.chrom', 'spots.cell_id', 'tracks.LaminB1', 'tracks.n_per_dist(um)', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'Nonzero, expected positive LaminB1_chromosome_peripheral_slope if larger n_per_dist(um) denotes greater peripheral distance in this dataset; direction should be confirmed by track definition during validation.', 'complexity': 2, 'idea_id': 'chromosome-specific-peripheral-positioning-by-la-8abdde15dc', 'metadata': {}})
# Input dataset and the directory that receives this run's artifacts
# (both relative to the current working directory).
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Start from "no dataset" and fill in whatever can actually be loaded, so the
# three handles are always defined for the cells that follow.
cdata = schema = adata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
if cdata is not None:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

print(IDEA.idea_id)
if cdata is not None:
    # Compact repr first, then the agent-facing schema description.
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

chromosome-specific-peripheral-positioning-by-la-8abdde15dc
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Validate the idea's required fields against the dataset schema; without a
# schema there is nothing to review and `review` stays None.
review = None
if schema is not None:
    review = review_idea_against_schema(IDEA, schema)
print(review.to_dict() if review is not None else None)
# Stop the notebook here when the review rejected the idea.
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight field and coverage inspection before the main analysis
import numpy as np
import pandas as pd

# Slim copies of just the columns this idea needs, used for the preview table.
spots_preview = cdata.spots[['chrom', 'cell_id']].copy()
tracks_preview = cdata.tracks[['LaminB1', 'n_per_dist(um)']].copy()

# One row per required field: (label, column present?, usable value count).
# Label columns are counted by non-missing entries, numeric tracks by finite ones.
coverage_rows = [
    ('spots.chrom',
     'chrom' in cdata.spots.columns,
     cdata.spots['chrom'].notna().sum()),
    ('spots.cell_id',
     'cell_id' in cdata.spots.columns,
     cdata.spots['cell_id'].notna().sum()),
    ('tracks.LaminB1',
     'LaminB1' in cdata.tracks.columns,
     np.isfinite(cdata.tracks['LaminB1']).sum()),
    ('tracks.n_per_dist(um)',
     'n_per_dist(um)' in cdata.tracks.columns,
     np.isfinite(cdata.tracks['n_per_dist(um)']).sum()),
    ('cells.cell_type',
     'cell_type' in cdata.cells.columns,
     cdata.cells['cell_type'].notna().sum()),
]
coverage = pd.DataFrame(
    coverage_rows,
    columns=['field', 'available', 'finite_or_nonmissing_count'],
)

print('spots shape:', cdata.spots.shape, 'tracks shape:', cdata.tracks.shape, 'cells shape:', cdata.cells.shape)
print('unique cells:', cdata.spots['cell_id'].nunique(), 'unique chromosomes:', cdata.spots['chrom'].nunique())
print('cell types:', cdata.cells['cell_type'].value_counts().to_dict())
display(coverage)

# First eight spots next to their track values, joined positionally.
display(pd.concat(
    [frame.head(8).reset_index(drop=True) for frame in (spots_preview, tracks_preview)],
    axis=1,
))

spots shape: (56036, 6) tracks shape: (56036, 62) cells shape: (9, 10)
unique cells: 9 unique chromosomes: 20
cell types: {'Granule': 3, 'Bergmann': 3, 'Purkinje': 3}
                   field  available  finite_or_nonmissing_count
0            spots.chrom       True                       56036
1          spots.cell_id       True                       56036
2         tracks.LaminB1       True                       56036
3  tracks.n_per_dist(um)       True                       56036
4        cells.cell_type       True                           9
   chrom cell_id  LaminB1  n_per_dist(um)
0  chr14  1_0_61 -1.46070    1.812949e-01
1   chr2  1_0_61 -0.92373    5.474538e-02
2  chr14  1_0_61 -1.53310    0.000000e+00
3   chr2  1_0_61 -1.14700   -8.881784e-16
4  chr14  1_0_61 -1.10470    3.141663e-02
5   chr2  1_0_61 -1.27970    6.554444e-02
6   chr6  1_0_61 -1.15900    1.290660e-01
7   chr3  1_0_61 -1.65370   -8.881784e-16

# Main compact exploration: chromosome-level LaminB1 vs peripheral distance with permutation test
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display, Image
import numpy as np
import pandas as pd
import json
from pathlib import Path

# Fixed seed so the permutation null (and therefore the p-value) is reproducible.
rng = np.random.default_rng(20250214)
n_permutations = 500
result_path = RUN_OUTPUT_DIR / 'chromosome-specific-peripheral-positioning-by-la-8abdde15dc_result.csv'
figure_path = RUN_OUTPUT_DIR / 'chromosome-specific-peripheral-positioning-by-la-8abdde15dc_statistical_summary.png'

# Build a spot-level table only with required fields.
# The columns are combined positionally (via to_numpy), i.e. row i of
# cdata.tracks is taken to describe row i of cdata.spots.
# NOTE(review): that row alignment is assumed, not checked here.
df = pd.DataFrame({
    'cell_id': cdata.spots['cell_id'].astype(str).to_numpy(),
    'chrom': cdata.spots['chrom'].astype(str).to_numpy(),
    'LaminB1': pd.to_numeric(cdata.tracks['LaminB1'], errors='coerce').to_numpy(),
    'n_per_dist_um': pd.to_numeric(cdata.tracks['n_per_dist(um)'], errors='coerce').to_numpy(),
})
# Drop spots where either measurement is missing or non-finite.
df = df[np.isfinite(df['LaminB1']) & np.isfinite(df['n_per_dist_um'])].copy()

# Attach cell type if possible; the lookup is keyed by the index of cdata.cells,
# which is expected to hold the cell ids.
# The spot-level 'cell_id' values were cast to str above, so the lookup keys are
# cast to str as well; otherwise a non-string cells index would silently map
# every spot to 'unknown'.
cell_type_map = {
    str(cell_key): cell_label
    for cell_key, cell_label in cdata.cells['cell_type'].astype(str).items()
}
df['cell_type'] = df['cell_id'].map(cell_type_map).fillna('unknown')

# Chromosome-by-cell summaries: the computable parameter is based on medians.
chrom_summary = (
    df.groupby(['cell_id', 'cell_type', 'chrom'], observed=True)
      .agg(median_LaminB1=('LaminB1', 'median'),
           median_n_per_dist_um=('n_per_dist_um', 'median'),
           n_spots=('chrom', 'size'))
      .reset_index()
)

# Per-cell OLS slope and Pearson r across chromosome medians. Require enough chromosomes and variation.
def slope_for_group(g):
    """Fit median peripheral distance against median LaminB1 for one cell.

    Parameters
    ----------
    g : DataFrame-like
        Rows are chromosomes; must provide the columns 'median_LaminB1'
        and 'median_n_per_dist_um'.

    Returns
    -------
    tuple
        ``(slope, pearson_r, n_used)`` where ``n_used`` is the number of
        chromosomes with both values finite. ``slope`` and ``pearson_r`` are
        NaN when fewer than four chromosomes remain or when either variable
        has zero spread.
    """
    lamin = g['median_LaminB1'].to_numpy(float)
    distance = g['median_n_per_dist_um'].to_numpy(float)

    # Keep only chromosomes where both medians are finite.
    finite = np.isfinite(lamin) & np.isfinite(distance)
    lamin, distance = lamin[finite], distance[finite]
    n_used = len(lamin)

    # A line through fewer than four points, or through a constant variable,
    # is not informative, so report it as missing instead of fitting.
    degenerate = n_used < 4 or np.nanstd(lamin) == 0 or np.nanstd(distance) == 0
    if degenerate:
        return np.nan, np.nan, n_used

    coefficients = np.polyfit(lamin, distance, 1)  # [slope, intercept]
    pearson_r = np.corrcoef(lamin, distance)[0, 1]
    return float(coefficients[0]), float(pearson_r), n_used

# Per-cell slope table. The column list is pinned so the frame keeps its schema
# (and the slope column lookup below keeps working) even when chrom_summary
# yields no cell groups at all.
cell_slope_columns = [
    'cell_id',
    'cell_type',
    'n_chromosomes',
    'n_spots',
    'LaminB1_chromosome_peripheral_slope',
    'chromosome_median_pearson_r',
]
cell_rows = []
for (cell_id, cell_type), g in chrom_summary.groupby(['cell_id', 'cell_type'], observed=True):
    slope, corr, n_chrom = slope_for_group(g)
    cell_rows.append({
        'cell_id': cell_id,
        'cell_type': cell_type,
        'n_chromosomes': int(n_chrom),
        'n_spots': int(g['n_spots'].sum()),
        'LaminB1_chromosome_peripheral_slope': slope,
        'chromosome_median_pearson_r': corr,
    })
cell_slopes = pd.DataFrame(cell_rows, columns=cell_slope_columns)
valid_slopes = cell_slopes['LaminB1_chromosome_peripheral_slope'].dropna().to_numpy(float)

if len(valid_slopes) >= 2:
    observed_statistic = float(np.mean(valid_slopes))
    # Negative control: shuffle chromosome-level LaminB1 labels within each cell, recompute mean slope.
    grouped = list(chrom_summary.groupby(['cell_id', 'cell_type'], observed=True))

    # The finite/variance screening of each cell does not depend on the
    # permutation, so it is done once here rather than in every iteration.
    # Cells are kept in groupby order, which keeps the sequence of RNG draws
    # (one permutation per eligible cell per iteration) unchanged.
    eligible_cells = []
    for _, g in grouped:
        x = g['median_LaminB1'].to_numpy(float)
        y = g['median_n_per_dist_um'].to_numpy(float)
        ok = np.isfinite(x) & np.isfinite(y)
        x, y = x[ok], y[ok]
        if len(x) >= 4 and np.nanstd(x) > 0 and np.nanstd(y) > 0:
            eligible_cells.append((x, y))

    null_distribution = np.empty(n_permutations, dtype=float)
    for b in range(n_permutations):
        # rng.permutation returns a shuffled copy; the stored x stays intact.
        permuted_cell_slopes = [
            float(np.polyfit(rng.permutation(x), y, 1)[0])
            for x, y in eligible_cells
        ]
        null_distribution[b] = np.mean(permuted_cell_slopes) if permuted_cell_slopes else np.nan
    null_distribution = null_distribution[np.isfinite(null_distribution)]
    # Two-sided permutation p-value with the +1 correction, so it is never exactly zero.
    p_value = float((np.sum(np.abs(null_distribution) >= abs(observed_statistic)) + 1) / (len(null_distribution) + 1))
    # 'pass' records that the test could be run, not that it was significant.
    hypothesis_test_status = 'pass'
    null_mean = float(np.mean(null_distribution))
    null_sd = float(np.std(null_distribution, ddof=1)) if len(null_distribution) > 1 else float('nan')
    permutation_z = float((observed_statistic - null_mean) / null_sd) if np.isfinite(null_sd) and null_sd > 0 else float('nan')
else:
    # Fewer than two usable cells: emit placeholder statistics and flag the
    # test as not performed.
    observed_statistic = float(np.mean(valid_slopes)) if len(valid_slopes) else 0.0
    null_distribution = np.array([0.0])
    p_value = 1.0
    hypothesis_test_status = 'insufficient_data'
    null_mean = 0.0
    null_sd = float('nan')
    permutation_z = float('nan')

# Global p-value/test method repeated per-cell so result_table is self-contained and verifier-friendly.
test_method = f'within-cell chromosome-label permutation test ({len(null_distribution)} permutations, two-sided mean slope)'
result_table = cell_slopes.copy()
result_table['observed_statistic_mean_slope'] = observed_statistic
result_table['effect_size'] = observed_statistic
result_table['permutation_z'] = permutation_z
result_table['p_value'] = p_value
result_table['test_method'] = test_method
result_table['hypothesis_test_status'] = hypothesis_test_status
result_table.to_csv(result_path, index=False)

# Machine-readable summary of the analysis. The verification cell further down
# reads its values through analysis_summary.get(...) (parameter_value, p_value,
# test_method, the two hypothesis texts, n_selected_cells, n_rows, notes, ...).
# 'parameter_value', 'observed_statistic' and 'effect_size' all carry the same
# number: the mean of the valid per-cell slopes.
analysis_summary = {
    'idea_id': IDEA.idea_id,
    'parameter_name': 'LaminB1_chromosome_peripheral_slope',
    'parameter_value': observed_statistic,
    'observed_statistic': observed_statistic,
    'effect_size': observed_statistic,
    'effect_size_units': 'um n_per_dist per LaminB1 track unit',
    'permutation_z': permutation_z,
    'p_value': p_value,
    'test_method': test_method,
    'hypothesis_test_status': hypothesis_test_status,
    'null_hypothesis': 'Within cells, chromosome-level median LaminB1 is exchangeable across chromosomes and the mean within-cell slope linking LaminB1 to n_per_dist(um) is no different from the shuffled-label null.',
    'alternative_hypothesis': 'Chromosomes with different median LaminB1 have systematically different peripheral distance, producing a nonzero mean within-cell slope.',
    # Counts every row of result_table, including cells whose slope is NaN.
    'n_selected_cells': int(len(result_table)),
    # Only the cells with a non-NaN slope, i.e. those entering the mean.
    'n_valid_cells_for_test': int(len(valid_slopes)),
    'n_rows': int(len(result_table)),
    # Spots that survived the finite-value filter.
    'n_spots': int(len(df)),
    'n_chromosome_cell_groups': int(len(chrom_summary)),
    # Number of finite null draws actually kept, not the requested count.
    'n_permutations': int(len(null_distribution)),
    'result_path': str(result_path),
    'figure_path': str(figure_path),
    'notes': [
        'Positive slope means larger median n_per_dist(um) for chromosomes with higher median LaminB1; interpretation of peripheral direction follows the dataset track definition.',
        'Negative control permutes LaminB1 chromosome labels within each cell, preserving per-cell chromosome-distance values.'
    ],
}

# Statistical figure: observed per-cell slopes and observed mean versus null distribution.
# Statistical figure: observed per-cell slopes and observed mean versus null distribution.
plt.rcParams.update({'figure.facecolor': 'white', 'axes.facecolor': 'white', 'font.size': 10})
fig, axes = plt.subplots(1, 2, figsize=(10.5, 4.2), constrained_layout=True)

# Left: per-cell slopes by cell type with mean line.
plot_df = result_table.sort_values(['cell_type', 'cell_id']).reset_index(drop=True)
colors = {'Granule': '#4C78A8', 'Bergmann': '#F58518', 'Purkinje': '#54A24B', 'unknown': '#777777'}
# Only the first point of each cell type gets a legend label; a set of the
# labels already used avoids re-querying the axes' legend handles per point.
labelled_cell_types = set()
for position, (cell_type, cell_slope) in enumerate(
        zip(plot_df['cell_type'], plot_df['LaminB1_chromosome_peripheral_slope'])):
    first_of_type = cell_type not in labelled_cell_types
    labelled_cell_types.add(cell_type)
    axes[0].scatter(position, cell_slope, s=52,
                    color=colors.get(cell_type, '#777777'), edgecolor='black', linewidth=0.4,
                    label=cell_type if first_of_type else None)
axes[0].axhline(0, color='0.55', linewidth=1, linestyle='--', label='zero slope')
axes[0].axhline(observed_statistic, color='black', linewidth=1.6, label='mean observed slope')
axes[0].set_xticks(range(len(plot_df)))
axes[0].set_xticklabels(plot_df['cell_id'], rotation=45, ha='right')
axes[0].set_ylabel('Within-cell slope\nmedian n_per_dist (um) / median LaminB1')
axes[0].set_xlabel('Cell')
axes[0].set_title('Cell-level chromosome slopes')
axes[0].legend(frameon=False, fontsize=8)

# Right: permutation null distribution of mean slopes.
axes[1].hist(null_distribution, bins=28, color='#D9E2EF', edgecolor='#4C78A8', linewidth=0.8, label='shuffled LaminB1 labels')
axes[1].axvline(observed_statistic, color='#B22222', linewidth=2.0, label='observed mean slope')
# Dotted lines mark +/-|observed|, the region counted by the two-sided p-value.
axes[1].axvline(-abs(observed_statistic), color='#B22222', linewidth=1.0, linestyle=':', label='two-sided threshold')
axes[1].axvline(abs(observed_statistic), color='#B22222', linewidth=1.0, linestyle=':')
axes[1].set_xlabel('Mean within-cell slope under null')
axes[1].set_ylabel('Permutation count')
axes[1].set_title('Chromosome-label permutation evidence')
annot = f"p={p_value:.4f}\neffect={observed_statistic:.4g} um/unit\nn cells={len(valid_slopes)}\n{len(null_distribution)} permutations"
axes[1].text(0.98, 0.95, annot, transform=axes[1].transAxes, va='top', ha='right',
             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='0.7'))
axes[1].legend(frameon=False, fontsize=8)
fig.suptitle('Chromosome-specific peripheral positioning by LaminB1', fontsize=12)
fig.savefig(figure_path, dpi=180, bbox_inches='tight')

# The Agg backend forced above is non-interactive, so plt.show() could only
# emit a "cannot be shown" UserWarning. The saved PNG is displayed instead and
# the figure is closed to release it.
display(Image(filename=str(figure_path)))
plt.close(fig)

print(json.dumps(analysis_summary, indent=2))
display(result_table)

<IPython.core.display.Image object>
{
  "idea_id": "chromosome-specific-peripheral-positioning-by-la-8abdde15dc",
  "parameter_name": "LaminB1_chromosome_peripheral_slope",
  "parameter_value": -0.2679491094600665,
  "observed_statistic": -0.2679491094600665,
  "effect_size": -0.2679491094600665,
  "effect_size_units": "um n_per_dist per LaminB1 track unit",
  "permutation_z": -2.604494674656123,
  "p_value": 0.005988023952095809,
  "test_method": "within-cell chromosome-label permutation test (500 permutations, two-sided mean slope)",
  "hypothesis_test_status": "pass",
  "null_hypothesis": "Within cells, chromosome-level median LaminB1 is exchangeable across chromosomes and the mean within-cell slope linking LaminB1 to n_per_dist(um) is no different from the shuffled-label null.",
  "alternative_hypothesis": "Chromosomes with different median LaminB1 have systematically different peripheral distance, producing a nonzero mean within-cell slope.",
  "n_selected_cells": 9,
  "n_valid_cells_for_test": 9,
  "n_rows": 9,
  "n_spots": 56036,
  "n_chromosome_cell_groups": 164,
  "n_permutations": 500,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/chromosome-specific-peripheral-positioning-by-la-8abdde15dc_result.csv",
  "figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/chromosome-specific-peripheral-positioning-by-la-8abdde15dc_statistical_summary.png",
  "notes": [
    "Positive slope means larger median n_per_dist(um) for chromosomes with higher median LaminB1; interpretation of peripheral direction follows the dataset track definition.",
    "Negative control permutes LaminB1 chromosome labels within each cell, preserving per-cell chromosome-distance values."
  ]
}
   cell_id  ... hypothesis_test_status
0  1_0_116  ...                   pass
1   1_0_26  ...                   pass
2   1_0_34  ...                   pass
3   1_0_37  ...                   pass
4   1_0_42  ...                   pass
5   1_0_47  ...                   pass
6   1_0_61  ...                   pass
7   1_0_63  ...                   pass
8   1_0_69  ...                   pass

[9 rows x 12 columns]

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/chromosome-specific-peripheral-positioning-by-la-8abdde15dc.ipynb:169: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  ]

# Ensure expected artifact paths exist relative to the workspace root for external audit tools.
from pathlib import Path
import shutil, os
# Canonical artifact locations, expressed against the workspace root.
workspace_root = Path('/Users/weizexu/Projects/U-Chrom')
expected_dir = workspace_root / 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg'
expected_dir.mkdir(parents=True, exist_ok=True)
expected_result = expected_dir / 'chromosome-specific-peripheral-positioning-by-la-8abdde15dc_result.csv'
expected_figure = expected_dir / 'chromosome-specific-peripheral-positioning-by-la-8abdde15dc_statistical_summary.png'

# Where the analysis cell reported having written its outputs.
current_result = Path(analysis_summary['result_path'])
current_figure = Path(analysis_summary['figure_path'])

for src, dst in zip((current_result, current_figure), (expected_result, expected_figure)):
    written_elsewhere = src.exists() and src.resolve() != dst.resolve()
    if written_elsewhere:
        # The artifact exists but at a different location: mirror it.
        shutil.copy2(src, dst)
        continue
    # Otherwise fall back to a same-named file in the current directory,
    # but only when the expected location is still empty.
    same_name_in_cwd = Path(src.name)
    if not dst.exists() and same_name_in_cwd.exists():
        shutil.copy2(same_name_in_cwd, dst)

# Report workspace-relative paths from here on.
analysis_summary['result_path'] = str(expected_result.relative_to(workspace_root))
analysis_summary['figure_path'] = str(expected_figure.relative_to(workspace_root))
# Keep the CSV in sync with the final expected location.
result_table.to_csv(expected_result, index=False)

print('cwd:', os.getcwd())
print('result exists:', expected_result.exists(), expected_result)
print('figure exists:', expected_figure.exists(), expected_figure)

cwd: /Users/weizexu/Projects/U-Chrom
result exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/chromosome-specific-peripheral-positioning-by-la-8abdde15dc_result.csv
figure exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/chromosome-specific-peripheral-positioning-by-la-8abdde15dc_statistical_summary.png

# One status slot per validation check declared by the idea; each starts as 'not_run'.
checks = {check: 'not_run' for check in IDEA.validation_checks}
# Free-text messages collected while the checks run; they are prepended to the
# summary's notes in the final verification report.
notes = []
# The hypothesis-test check is always tracked, even if the idea did not list it.
checks.setdefault('statistical_hypothesis_test', 'not_run')

def _check_keys(prefix):
    """Return the keys of ``checks`` that belong to ``prefix``.

    A key belongs to ``prefix`` when it equals it exactly or is namespaced
    under it as ``'<prefix>:<suffix>'``. Keys are returned in the dict's
    iteration order.
    """
    namespace = prefix + ':'
    return [key for key in checks if key == prefix or key.startswith(namespace)]


def _set_check(prefix, value):
    """Record ``value`` for every check belonging to ``prefix``.

    When no existing key matches, a new entry named ``prefix`` is created so
    the result is never dropped.
    """
    for key in _check_keys(prefix) or [prefix]:
        checks[key] = value


def _check_status(prefix):
    """Collapse the statuses of all checks belonging to ``prefix`` into one.

    Returns
    -------
    str or None
        ``None`` when no check matches; ``'fail'`` when any matching check
        failed; ``'pass'`` only when every matching check passed; otherwise
        the first status that is not ``'pass'`` (for example ``'not_run'``).
    """
    values = [checks[key] for key in _check_keys(prefix)]
    if not values:
        return None
    if 'fail' in values:
        return 'fail'
    # Report the first status that is not 'pass'. Falling back to values[0]
    # would report 'pass' for a group such as ['pass', 'not_run'], although
    # not every member of the group has passed.
    return next((value for value in values if value != 'pass'), 'pass')

# required_fields_exist mirrors the schema review: it passes only when a review
# object exists and was accepted.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
# Each remaining section runs only if the idea declared the corresponding check.
if _check_keys('cell_id_alignment'):
    # Compare cell ids between ChromData and the linked AnnData, as strings and in order.
    # NOTE(review): `aligned` keeps its default True when either object is missing
    # or when the two lengths differ, so a length mismatch is reported as 'pass'.
    # Confirm that this is intended.
    aligned = True
    if cdata is not None and adata is not None and len(cdata.cells) == len(adata.obs_names):
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Cell count, in order of preference: the summary's n_selected_cells, the
    # result table length (only if it has a 'cell_type' column), then the
    # dataset's own cell count.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    # The threshold applied here is a single cell.
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    # Uses the number of result rows (summary value, else the table length);
    # at least one row is required.
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
    # The reported parameter must be present and finite.
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
    # NOTE: these assignments rebind the module-level names (p_value,
    # test_method, observed_statistic, ...) that the analysis cell defined.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    # A summary without this key is treated as if the test had been run.
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    # Coerce the three numeric fields to float; anything unconvertible becomes
    # NaN and is rejected by the isfinite tests below.
    try:
        p_float = float(p_value)
    except Exception:
        p_float = np.nan
    try:
        stat_float = float(observed_statistic)
    except Exception:
        stat_float = np.nan
    try:
        effect_float = float(effect_size)
    except Exception:
        effect_float = np.nan
    # All of: non-blank method and hypothesis texts, a finite p-value in [0, 1],
    # finite statistic and effect size, and a test that was not skipped.
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    # The result table itself must also carry the p-value and method columns.
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Keyword detection over the test-method text, the summary's key names and
    # the result table's column names (all lower-cased).
    # NOTE(review): this is a substring match only; summary keys such as
    # 'n_permutations' or 'permutation_z' satisfy it on their own, without
    # inspecting what the analysis actually computed.
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    result_columns_text = ''
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = any(
        token in control_text
        for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    )
    # A missing control is recorded as 'not_implemented' rather than 'fail'.
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )
# Any control-type check that was never evaluated is marked 'not_implemented'
# instead of being left at 'not_run'.
for check in list(checks):
    if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
        checks[check] = 'not_implemented'

# Only these checks gate the overall verdict; every other check is reported
# in the output but cannot turn the status into 'fail'.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [name for name in required_for_pass if _check_status(name) == 'fail']
status = 'fail' if failed_required else 'pass'
notes.extend(f'{name} failed' for name in failed_required)

# An analysis that produced no rows fails regardless of the individual checks.
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final report: verdict and per-check statuses first, then the headline
# numbers copied from the summary, then verifier notes followed by analysis notes.
verification = {'idea_id': IDEA.idea_id, 'status': status, 'checks': checks}
verification.update({
    field: analysis_summary.get(field)
    for field in ('parameter_value', 'p_value', 'test_method', 'effect_size', 'result_path')
})
verification['notes'] = notes + analysis_summary.get('notes', [])
print(json.dumps(verification, indent=2))

{
  "idea_id": "chromosome-specific-peripheral-positioning-by-la-8abdde15dc",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test": "pass",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass"
  },
  "parameter_value": -0.2679491094600665,
  "p_value": 0.005988023952095809,
  "test_method": "within-cell chromosome-label permutation test (500 permutations, two-sided mean slope)",
  "effect_size": -0.2679491094600665,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/chromosome-specific-peripheral-positioning-by-la-8abdde15dc_result.csv",
  "notes": [
    "Positive slope means larger median n_per_dist(um) for chromosomes with higher median LaminB1; interpretation of peripheral direction follows the dataset track definition.",
    "Negative control permutes LaminB1 chromosome labels within each cell, preserving per-cell chromosome-distance values."
  ]
}

Auto-discovery idea: Chromosome-specific peripheral positioning by LaminB1

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and analysis plan

Statistical figure

Runner verification summary

Final interpretation

Final interpretation