# Ensure scaffold relative paths resolve from the U-Chrom workspace root.
import os
# Name the workspace root once, switch into it, and echo the resulting cwd so
# the relative scaffold paths used below can be checked in the cell output.
workspace_root = '/Users/weizexu/Projects/U-Chrom'
os.chdir(workspace_root)
print('cwd', os.getcwd())

cwd /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

IDEA = DiscoveryIdea.from_dict({'idea_title': 'H3K27ac-high loci resist spurious compaction calls after marker permutation', 'biological_hypothesis': 'H3K27ac-enriched chromatin bins have larger local 3D separation from neighboring genomic bins than H3K27ac-depleted bins, and this effect is robust to within-trace negative controls.', 'computable_parameter': 'h3k27ac_opening_delta_um: median adjacent-bin 3D distance at top-quartile H3K27ac spots minus median adjacent-bin 3D distance at bottom-quartile H3K27ac spots, aggregated across eligible trace-chromosome groups.', 'analysis_plan': 'Sort spots by spots.cell_id, spots.trace_id, spots.chrom, and spots.start. For each trace-chromosome group with enough ordered spots, compute adjacent 3D distances from coords and assign distances to bins by local H3K27ac quartile from tracks.H3K27ac. Aggregate the top-minus-bottom median distance difference across groups and test whether it is greater than zero using a paired sign-flip or Wilcoxon test. Repeat after permuting tracks.H3K27ac within each trace as a negative control and require the observed parameter to exceed the permutation null.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nActive chromatin marks should not appear compact merely because of sampling density or trace identity. This idea asks whether H3K27ac-high spots are consistently more open than H3K27ac-low spots within matched chromosome traces.\n\n### Data used\nUse 3D spot coordinates, spot trace/chromosome/bin annotations, cell type labels, and the H3K27ac IF track across Granule, Bergmann, and Purkinje cells.\n\n### Analysis sketch\nWithin each trace and chromosome, rank spots by H3K27ac and compute the median adjacent-bin 3D distance among top-quartile versus bottom-quartile H3K27ac spots. 
The single reported parameter is the top-minus-bottom distance difference aggregated across eligible traces.\n\n### Expected result\nIf active acetylated regions are more spatially open, the parameter should be positive and remain positive within cell-type strata.\n\n### Validation checks\nRequire field existence, enough cells and traces, finite output, a paired permutation or Wilcoxon test with p-value, fixed random seed rerun, runtime budget, and a negative control that permutes H3K27ac values within each trace.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['coords', 'spots.cell_id', 'spots.trace_id', 'spots.chrom', 'spots.start', 'tracks.H3K27ac', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test_with_p_value', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'h3k27ac_opening_delta_um should be positive in the real data and near zero after within-trace H3K27ac permutation.', 'complexity': 3, 'idea_id': 'h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89', 'metadata': {}})
# Input dataset and output directory, both relative to the current working directory.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the dataset only when a path is configured; every dependent handle
# falls back to None so later cells can test for availability.
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
else:
    cdata = None

if cdata is None:
    schema = None
    adata = None
else:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

# Echo the idea identifier and, when data is loaded, its summary and agent-facing description.
print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Review the idea against the dataset schema when one is available, print the
# review payload, and stop the run if the review was not accepted.
review = None
if schema is not None:
    review = review_idea_against_schema(IDEA, schema)
review_payload = None if review is None else review.to_dict()
print(review_payload)
if review is not None:
    assert review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection for required fields and alignment assumptions.
import numpy as np
import pandas as pd

# Summarize the fields the analysis relies on: spot annotations, cell-type
# labels, the H3K27ac track, and the coordinate array.
spots_preview = cdata.spots[['cell_id', 'trace_id', 'chrom', 'start']].head(5).copy()
cell_type_counts = cdata.cells['cell_type'].value_counts().sort_index().to_dict()
track_names = list(cdata.tracks.columns) if hasattr(cdata.tracks, 'columns') else list(getattr(cdata, 'tracks', {}).keys())
h3 = np.asarray(cdata.tracks['H3K27ac'], dtype=float)
coords_arr = np.asarray(cdata.coords, dtype=float)
# A spot counts as usable only when its marker value and all coordinates are finite.
finite_mask = np.isfinite(h3) & np.all(np.isfinite(coords_arr), axis=1)
inspection_summary = {
    'n_spots': int(len(cdata.spots)),
    'n_traces': int(cdata.spots['trace_id'].nunique()),
    'n_cells': int(len(cdata.cells)),
    'cell_type_counts': cell_type_counts,
    'has_H3K27ac': 'H3K27ac' in track_names,
    'coords_shape': tuple(coords_arr.shape),
    'finite_h3k27ac_and_coords_fraction': float(finite_mask.mean()),
    # observed=True is passed explicitly: it matches the other groupby calls in
    # this notebook and avoids the pandas FutureWarning about the changing
    # default for categorical grouping keys.
    'n_trace_chrom_groups': int(cdata.spots.groupby(['trace_id', 'chrom'], observed=True).ngroups),
}
print(json.dumps(inspection_summary, indent=2))
display(spots_preview)

{
  "n_spots": 56036,
  "n_traces": 213,
  "n_cells": 9,
  "cell_type_counts": {
    "Bergmann": 3,
    "Granule": 3,
    "Purkinje": 3
  },
  "has_H3K27ac": true,
  "coords_shape": [
    56036,
    3
  ],
  "finite_h3k27ac_and_coords_fraction": 1.0,
  "n_trace_chrom_groups": 213
}
  cell_id         trace_id  chrom      start
0  1_0_61  1_0_61_chr14_a2  chr14   30425000
1  1_0_61   1_0_61_chr2_a2   chr2   99725000
2  1_0_61  1_0_61_chr14_a1  chr14   39625000
3  1_0_61   1_0_61_chr2_a2   chr2   99950000
4  1_0_61  1_0_61_chr14_a1  chr14  108450000

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89.ipynb:19: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  "### Expected result\n",

# Main compact exploration: adjacent-bin opening at H3K27ac-high versus H3K27ac-low loci.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import display

# Fixed seed shared by the permutation test, the sign-flip test and the plot jitter.
rng = np.random.default_rng(20250214)
result_path = RUN_OUTPUT_DIR / 'h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_result.csv'
figure_path = RUN_OUTPUT_DIR / 'h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_statistical_summary.png'

spots = cdata.spots[['cell_id', 'trace_id', 'chrom', 'start']].copy().reset_index(drop=True)
coords = np.asarray(cdata.coords, dtype=float)
h3 = np.asarray(cdata.tracks['H3K27ac'], dtype=float)
cell_type_lookup = cdata.cells['cell_type'].astype(str).to_dict()
spots['cell_type'] = spots['cell_id'].map(cell_type_lookup).astype(str)
# Positional row index used to slice coords/h3.
# NOTE(review): assumes coords and tracks rows are in the same order as cdata.spots - confirm.
spots['_idx'] = np.arange(len(spots))

# Precompute per-trace/chromosome adjacent distances and upstream-bin H3K27ac values.
group_records = []
min_adjacent_edges = 8
min_vertices = min_adjacent_edges + 1  # n adjacent edges need n + 1 ordered spots
for (cell_id, trace_id, chrom), sub in spots.groupby(['cell_id', 'trace_id', 'chrom'], observed=True, sort=False):
    sub = sub.sort_values('start')
    idx = sub['_idx'].to_numpy()
    if idx.size < min_vertices:
        continue
    xyz = coords[idx]
    h3_local = h3[idx]
    # Drop spots with a non-finite coordinate or marker value before forming
    # edges; the remaining spots keep their genomic order.
    finite_vertices = np.all(np.isfinite(xyz), axis=1) & np.isfinite(h3_local)
    if finite_vertices.sum() < min_vertices:
        continue
    xyz = xyz[finite_vertices]
    h3_local = h3_local[finite_vertices]
    # Euclidean distance between consecutive retained spots; each edge carries
    # the marker value of its upstream (lower start) spot.
    d = np.linalg.norm(np.diff(xyz, axis=0), axis=1)
    h_up = h3_local[:-1]
    # Marker values are already finite here, so only the distances need checking.
    edge_finite = np.isfinite(d)
    d = d[edge_finite]
    h_up = h_up[edge_finite]
    # Require enough edges and at least four distinct marker values so that
    # quartile cut-offs can be formed.
    if d.size < min_adjacent_edges or np.unique(h_up).size < 4:
        continue
    group_records.append({
        'cell_id': str(cell_id),
        'trace_id': str(trace_id),
        'chrom': str(chrom),
        'cell_type': str(cell_type_lookup.get(cell_id, 'unknown')),
        'n_edges': int(d.size),
        'dist_um': d,
        'h3': h_up,
    })

def high_low_delta(h_vals, dist_vals):
    """Compare adjacent distances at high-marker versus low-marker edges.

    Pairs with a non-finite marker or distance are discarded. Edges whose
    marker is at or above the 75th percentile form the "high" set; edges at
    or below the 25th percentile form the "low" set.

    Returns a 5-tuple ``(delta, high_median, low_median, n_high, n_low)``
    where ``delta = high_median - low_median``. The three float entries are
    NaN when fewer than ``min_adjacent_edges`` usable pairs remain, when the
    marker has fewer than four distinct values (counts are then reported as
    0), or when either set is empty.
    """
    marker = np.asarray(h_vals, dtype=float)
    distance = np.asarray(dist_vals, dtype=float)
    usable = np.isfinite(marker) & np.isfinite(distance)
    marker, distance = marker[usable], distance[usable]

    # Guard: not enough data to define meaningful quartiles.
    if marker.size < min_adjacent_edges or np.unique(marker).size < 4:
        return np.nan, np.nan, np.nan, 0, 0

    lower_cut, upper_cut = np.quantile(marker, [0.25, 0.75])
    bottom = distance[marker <= lower_cut]
    top = distance[marker >= upper_cut]
    n_top, n_bottom = int(top.size), int(bottom.size)
    if n_bottom == 0 or n_top == 0:
        return np.nan, np.nan, np.nan, n_top, n_bottom

    top_median = float(np.median(top))
    bottom_median = float(np.median(bottom))
    return top_median - bottom_median, top_median, bottom_median, n_top, n_bottom

# Column layout of the per-group table. Declaring it up front keeps the
# columns present even when no group yields a finite delta, so the later
# per-cell-type groupby does not fail on a column-less frame.
per_group_columns = [
    'cell_id',
    'trace_id',
    'chrom',
    'cell_type',
    'n_edges',
    'n_high_edges',
    'n_low_edges',
    'high_median_adjacent_distance_um',
    'low_median_adjacent_distance_um',
    'delta_um',
]

# One row per eligible trace-chromosome group with a finite top-minus-bottom delta.
per_group_rows = []
for rec in group_records:
    delta, high_med, low_med, n_high, n_low = high_low_delta(rec['h3'], rec['dist_um'])
    if np.isfinite(delta):
        per_group_rows.append({
            'cell_id': rec['cell_id'],
            'trace_id': rec['trace_id'],
            'chrom': rec['chrom'],
            'cell_type': rec['cell_type'],
            'n_edges': rec['n_edges'],
            'n_high_edges': n_high,
            'n_low_edges': n_low,
            'high_median_adjacent_distance_um': high_med,
            'low_median_adjacent_distance_um': low_med,
            'delta_um': float(delta),
        })

per_group = pd.DataFrame(per_group_rows, columns=per_group_columns)
deltas = per_group['delta_um'].to_numpy(dtype=float) if len(per_group) else np.array([], dtype=float)
n_groups = int(np.isfinite(deltas).sum())

# Hypothesis tests on the per-group deltas. Both tests draw from the shared
# `rng`, so the order of the loops below determines the reported numbers.
n_permutations = 500
if n_groups >= 5:
    # Test statistic: mean of the per-group top-minus-bottom deltas.
    observed_statistic = float(np.mean(deltas))
    # Primary negative-control test: permute H3K27ac values within each trace-chromosome group.
    null_distribution = np.empty(n_permutations, dtype=float)
    for p in range(n_permutations):
        perm_deltas = []
        for rec in group_records:
            # Shuffle marker values across the group's edges; distances stay fixed.
            perm_h = rng.permutation(rec['h3'])
            perm_delta, *_ = high_low_delta(perm_h, rec['dist_um'])
            if np.isfinite(perm_delta):
                perm_deltas.append(perm_delta)
        null_distribution[p] = np.mean(perm_deltas) if perm_deltas else np.nan
    # Discard permutations in which no group produced a finite delta.
    null_distribution = null_distribution[np.isfinite(null_distribution)]
    # One-sided (greater) p-value with the +1 correction in numerator and denominator.
    p_value = float((1 + np.sum(null_distribution >= observed_statistic)) / (1 + len(null_distribution)))
    # Effect size: observed statistic relative to the median of the permutation null.
    effect_size = float(observed_statistic - np.median(null_distribution))
    # Supplementary paired sign-flip test for whether group deltas are > 0.
    signflip_null = np.empty(n_permutations, dtype=float)
    for p in range(n_permutations):
        signs = rng.choice([-1.0, 1.0], size=n_groups)
        signflip_null[p] = np.mean(deltas * signs)
    signflip_p_value = float((1 + np.sum(signflip_null >= observed_statistic)) / (1 + n_permutations))
    # 'pass' records that the test was run, not that the hypothesis was supported.
    hypothesis_test_status = 'pass'
else:
    # Fewer than five groups: skip both tests and report neutral placeholders.
    observed_statistic = float(np.nanmean(deltas)) if deltas.size else 0.0
    null_distribution = np.array([0.0], dtype=float)
    p_value = 1.0
    effect_size = float(observed_statistic)
    signflip_p_value = 1.0
    hypothesis_test_status = 'insufficient_data'

# Descriptive per-cell-type strata: mean and median of the group deltas within
# each cell type. No separate hypothesis test is run for these rows.
cell_type_rows = []
for stratum_name, stratum in per_group.groupby('cell_type', observed=True):
    stratum_deltas = stratum['delta_um'].to_numpy(dtype=float)
    has_values = len(stratum_deltas) > 0
    cell_type_rows.append(dict(
        scope=f'cell_type:{stratum_name}',
        cell_type=str(stratum_name),
        n_groups=int(len(stratum_deltas)),
        observed_statistic=float(np.mean(stratum_deltas)) if has_values else np.nan,
        median_group_delta_um=float(np.median(stratum_deltas)) if has_values else np.nan,
    ))

# Human-readable description of the test and its hypotheses, reused in the
# result table and in the analysis summary.
test_method = f'within-trace H3K27ac marker permutation test ({len(null_distribution)} permutations); supplementary paired sign-flip ({n_permutations})'
null_hypothesis = 'Within each trace-chromosome, H3K27ac labels are exchangeable with respect to adjacent-bin 3D distances; top-minus-bottom H3K27ac median distance is no larger than marker-permuted control.'
alternative_hypothesis = 'H3K27ac-high adjacent bins have larger local 3D separation than H3K27ac-low bins, producing a positive top-minus-bottom median distance beyond within-trace marker permutations.'

# Result table contains one overall inferential row plus compact cell-type descriptive strata.
overall_row = {
    'scope': 'overall',
    'cell_type': 'all',
    'n_groups': n_groups,
    'n_edges': int(per_group['n_edges'].sum()) if len(per_group) else 0,
    'observed_statistic': observed_statistic,
    'effect_size': effect_size,
    'p_value': p_value,
    'signflip_p_value': signflip_p_value,
    'test_method': test_method,
    'hypothesis_test_status': hypothesis_test_status,
    'null_median': float(np.median(null_distribution)) if len(null_distribution) else np.nan,
    'null_q025': float(np.quantile(null_distribution, 0.025)) if len(null_distribution) else np.nan,
    'null_q975': float(np.quantile(null_distribution, 0.975)) if len(null_distribution) else np.nan,
}
result_table = pd.concat([pd.DataFrame([overall_row]), pd.DataFrame(cell_type_rows)], ignore_index=True)
# The inferential columns (p_value, signflip_p_value, effect_size, test_method,
# hypothesis_test_status) come from overall_row, so they always exist in the
# concatenated table. They are deliberately left missing on the cell-type
# rows: no test was run per stratum, and copying the overall values there
# would present them as per-cell-type results.
result_table.to_csv(result_path, index=False)

# Machine-readable summary consumed by the verification cell below and printed
# as JSON. Key order here is the order in which the JSON is emitted.
analysis_summary = {
    'idea_id': IDEA.idea_id,
    'parameter_name': 'h3k27ac_opening_delta_um',
    # The reported parameter is the observed test statistic (mean group delta).
    'parameter_value': observed_statistic,
    'observed_statistic': observed_statistic,
    'effect_size': effect_size,
    'p_value': p_value,
    'signflip_p_value': signflip_p_value,
    'test_method': test_method,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
    'hypothesis_test_status': hypothesis_test_status,
    # Distinct cells contributing at least one group with a finite delta.
    'n_selected_cells': int(per_group['cell_id'].nunique()) if len(per_group) else 0,
    # n_rows is the number of trace-chromosome groups with a finite delta.
    'n_rows': n_groups,
    'n_edges': int(per_group['n_edges'].sum()) if len(per_group) else 0,
    # Number of permutations retained in the null (1 in the insufficient-data branch).
    'n_permutations': int(len(null_distribution)),
    'result_path': str(result_path),
    'statistical_figure_path': str(figure_path),
    'negative_control': 'H3K27ac values permuted within each eligible trace-chromosome group before reapplying quartile labels.',
    'notes': [
        'Adjacent distances are assigned to the upstream genomic bin after sorting by start.',
        'No all-vs-all distance matrix was computed; analysis is linear in spots plus bounded permutations.',
    ],
}

# Statistical figure: observed aggregate delta against marker-permutation null and group-level deltas by cell type.
plt.rcParams.update({'figure.facecolor': 'white', 'axes.facecolor': 'white', 'font.size': 10})
fig, axes = plt.subplots(1, 2, figsize=(11, 4.2), constrained_layout=True)

# Left panel: histogram of the permutation null with the observed statistic
# and the null median marked as vertical lines.
ax = axes[0]
ax.hist(null_distribution, bins=30, color='#b8c7dd', edgecolor='white', label='Within-trace H3K27ac permutations')
ax.axvline(observed_statistic, color='#b2182b', linewidth=2.2, label='Observed mean delta')
ax.axvline(np.median(null_distribution), color='#2166ac', linestyle='--', linewidth=1.6, label='Null median')
ax.set_xlabel('Mean top-minus-bottom adjacent distance delta (µm)')
ax.set_ylabel('Permutation count')
ax.set_title('Marker-permutation hypothesis test')
ax.legend(frameon=False, fontsize=8)
# Text box in the upper-right corner with the headline numbers of the test.
annotation = f'p={p_value:.4f}\neffect={effect_size:.3f} µm\nn={n_groups} groups\n{len(null_distribution)} permutations'
ax.text(0.98, 0.95, annotation, transform=ax.transAxes, va='top', ha='right', fontsize=9,
        bbox=dict(facecolor='white', edgecolor='0.7', boxstyle='round,pad=0.3'))

# Right panel: one jittered point per trace-chromosome group, one column per
# cell type, with a black bar at each cell type's mean delta.
ax = axes[1]
order = sorted(per_group['cell_type'].unique()) if len(per_group) else []
positions = np.arange(len(order))
for pos, ct in zip(positions, order):
    vals = per_group.loc[per_group['cell_type'] == ct, 'delta_um'].to_numpy(dtype=float)
    # Horizontal jitter is drawn from the shared seeded rng, so the figure is
    # reproducible only if the preceding rng draws are unchanged.
    jitter = rng.normal(0, 0.035, size=len(vals))
    ax.scatter(np.full(len(vals), pos) + jitter, vals, s=18, alpha=0.65, label=ct)
    if len(vals):
        ax.hlines(np.mean(vals), pos - 0.25, pos + 0.25, colors='black', linewidth=2)
# Dashed reference line at zero (no high-versus-low difference).
ax.axhline(0, color='0.35', linewidth=1, linestyle='--')
ax.set_xticks(positions)
ax.set_xticklabels(order, rotation=20, ha='right')
ax.set_ylabel('Group top-minus-bottom median distance (µm)')
ax.set_title('Trace-chromosome paired deltas by cell type')
ax.legend(frameon=False, fontsize=8, loc='best')
fig.suptitle('H3K27ac-high loci resist spurious compaction calls', fontsize=12)
# Save before display, then close the figure to release it.
fig.savefig(figure_path, dpi=180, bbox_inches='tight')
display(fig)
plt.close(fig)

# Emit the summary and tables into the cell output.
print(json.dumps(analysis_summary, indent=2))
display(result_table)
display(per_group.head(10))

Figure(1100x420)
{
  "idea_id": "h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89",
  "parameter_name": "h3k27ac_opening_delta_um",
  "parameter_value": -0.01583664958040238,
  "observed_statistic": -0.01583664958040238,
  "effect_size": -0.014128201304822752,
  "p_value": 0.874251497005988,
  "signflip_p_value": 0.8782435129740519,
  "test_method": "within-trace H3K27ac marker permutation test (500 permutations); supplementary paired sign-flip (500)",
  "null_hypothesis": "Within each trace-chromosome, H3K27ac labels are exchangeable with respect to adjacent-bin 3D distances; top-minus-bottom H3K27ac median distance is no larger than marker-permuted control.",
  "alternative_hypothesis": "H3K27ac-high adjacent bins have larger local 3D separation than H3K27ac-low bins, producing a positive top-minus-bottom median distance beyond within-trace marker permutations.",
  "hypothesis_test_status": "pass",
  "n_selected_cells": 9,
  "n_rows": 213,
  "n_edges": 55823,
  "n_permutations": 500,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_statistical_summary.png",
  "negative_control": "H3K27ac values permuted within each eligible trace-chromosome group before reapplying quartile labels.",
  "notes": [
    "Adjacent distances are assigned to the upstream genomic bin after sorting by start.",
    "No all-vs-all distance matrix was computed; analysis is linear in spots plus bounded permutations."
  ]
}
                scope cell_type  ...  null_q975  median_group_delta_um
0             overall       all  ...   0.023377                    NaN
1  cell_type:Bergmann  Bergmann  ...        NaN               0.050727
2   cell_type:Granule   Granule  ...        NaN               0.009361
3  cell_type:Purkinje  Purkinje  ...        NaN              -0.046580

[4 rows x 14 columns]
  cell_id         trace_id  ... low_median_adjacent_distance_um  delta_um
0  1_0_61  1_0_61_chr14_a2  ...                        0.307461  0.050178
1  1_0_61   1_0_61_chr2_a2  ...                        0.352496  0.042439
2  1_0_61  1_0_61_chr14_a1  ...                        0.304759  0.107475
3  1_0_61   1_0_61_chr2_a1  ...                        0.426375 -0.066344
4  1_0_61   1_0_61_chr6_a2  ...                        0.311922  0.095652
5  1_0_61   1_0_61_chr3_a1  ...                        0.396360 -0.081514
6  1_0_61   1_0_61_chr3_a2  ...                        0.393918 -0.039834
7  1_0_61  1_0_61_chr11_a1  ...                        0.484469  0.116936
8  1_0_61  1_0_61_chr10_a1  ...                        1.244139 -0.623949
9  1_0_61  1_0_61_chr17_a1  ...                        0.568645 -0.000867

[10 rows x 10 columns]

# Ensure required result/figure artifacts exist at the exact requested workspace-relative paths.
from pathlib import Path
import shutil, os
# Absolute locations at which the figure and result table must exist.
workspace_root_str = '/Users/weizexu/Projects/U-Chrom'
figure_name = 'h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_statistical_summary.png'
required_dir = Path('/Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
required_dir.mkdir(parents=True, exist_ok=True)
required_figure_path = required_dir / figure_name
required_result_path = required_dir / 'h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_result.csv'

# Copy from any accidental relative location if needed; otherwise save/write directly.
recorded_figure = Path(analysis_summary['statistical_figure_path'])
stray_figure = Path(figure_name)
if recorded_figure.exists() and recorded_figure.resolve() != required_figure_path.resolve():
    # The figure was saved somewhere other than the required location.
    shutil.copy2(recorded_figure, required_figure_path)
elif stray_figure.exists():
    # The figure landed directly in the current working directory.
    shutil.copy2(figure_name, required_figure_path)

# Rewrite the result table at the required location when it is available.
if 'result_table' in globals():
    result_table.to_csv(required_result_path, index=False)

# Record both artifact paths relative to the workspace root.
analysis_summary['statistical_figure_path'] = str(required_figure_path.relative_to(workspace_root_str))
analysis_summary['result_path'] = str(required_result_path.relative_to(workspace_root_str))
print('figure exists:', required_figure_path.exists(), required_figure_path)
print('result exists:', required_result_path.exists(), required_result_path)

figure exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_statistical_summary.png
result exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_result.csv

# Every check requested by the idea starts as 'not_run'; the base
# statistical-test check is always tracked, whether or not it was requested.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
notes = []
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    return [key for key in checks if key == prefix or key.startswith(prefix + ':')]

def _set_check(prefix, value):
    keys = _check_keys(prefix)
    if not keys:
        checks[prefix] = value
        return
    for key in keys:
        checks[key] = value

def _check_status(prefix):
    values = [checks[key] for key in _check_keys(prefix)]
    if not values:
        return None
    if 'fail' in values:
        return 'fail'
    if all(value == 'pass' for value in values):
        return 'pass'
    return values[0]

def _to_float(value):
    """Coerce ``value`` to float, returning NaN when it cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan


# Runner verification: fill in the status of each tracked check from the
# schema review, the analysis summary and the result table, then derive an
# overall status.

# Field availability is taken from the schema review performed earlier.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # Only compared when both tables have the same number of cells; otherwise
    # alignment is treated as satisfied.
    aligned = True
    if cdata is not None and adata is not None and len(cdata.cells) == len(adata.obs_names):
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    p_float = _to_float(p_value)
    stat_float = _to_float(observed_statistic)
    effect_float = _to_float(effect_size)
    # The test counts as present only when it is fully described, its numbers
    # are finite, the p-value is a valid probability, and it actually ran.
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    # An idea may request this check under the qualified name
    # 'statistical_hypothesis_test_with_p_value'. The criteria above already
    # require a finite p-value in [0, 1], so mirror the outcome onto that
    # name instead of leaving it at 'not_run'.
    if 'statistical_hypothesis_test_with_p_value' in checks:
        checks['statistical_hypothesis_test_with_p_value'] = 'pass' if has_required_test else 'fail'
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Keyword search over the test description, the summary keys and the
    # result columns for evidence of a control or permutation.
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    result_columns_text = ''
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = any(
        token in control_text
        for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    )
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )
# Control-type checks that were never evaluated are reported as not implemented.
for check in list(checks):
    if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
        checks[check] = 'not_implemented'

# Overall status: any failure among the mandatory checks, or an empty result, fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
status = 'pass'
for check in required_for_pass:
    if _check_status(check) == 'fail':
        status = 'fail'
        notes.append(f'{check} failed')
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = len(result_table) if result_table is not None else 0
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

verification = {
    'idea_id': IDEA.idea_id,
    'status': status,
    'checks': checks,
    'parameter_value': analysis_summary.get('parameter_value'),
    'p_value': analysis_summary.get('p_value'),
    'test_method': analysis_summary.get('test_method'),
    'effect_size': analysis_summary.get('effect_size'),
    'result_path': analysis_summary.get('result_path'),
    'notes': notes + analysis_summary.get('notes', []),
}
print(json.dumps(verification, indent=2))

{
  "idea_id": "h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_with_p_value": "not_run",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": -0.01583664958040238,
  "p_value": 0.874251497005988,
  "test_method": "within-trace H3K27ac marker permutation test (500 permutations); supplementary paired sign-flip (500)",
  "effect_size": -0.014128201304822752,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/h3k27ac-high-loci-resist-spurious-compaction-cal-b54d7cca89_result.csv",
  "notes": [
    "Adjacent distances are assigned to the upstream genomic bin after sorting by start.",
    "No all-vs-all distance matrix was computed; analysis is linear in spots plus bounded permutations."
  ]
}

Auto-discovery idea: H3K27ac-high loci resist spurious compaction calls after marker permutation¶

Rationale¶

Data used¶

Analysis sketch¶

Expected result¶

Validation checks¶

Graphical abstract¶

Required data checks¶

Exploration¶

Critique and compact plan¶

Statistical figure¶

Runner verification summary¶

Final interpretation¶

Final interpretation¶