import os
# Switch to the project checkout so the relative 'tmp/...' paths used further down
# resolve against it. NOTE(review): the absolute path is machine-specific.
os.chdir('/Users/weizexu/Projects/U-Chrom')
print('cwd', os.getcwd())

cwd /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Idea specification for this run, rebuilt from its dict form (title, hypothesis,
# computable parameter, required fields, validation checks, expected direction).
# NOTE(review): 'validation_checks' has no 'cell_id_alignment' entry; the schema review
# run later in this notebook warns that multi-modal ideas should include one, and the
# verification cell only evaluates alignment when that key is present.
IDEA = DiscoveryIdea.from_dict({'idea_title': 'Pcp2-linked active chromatin hub proximity across cell types', 'biological_hypothesis': 'Pcp2-high cells show tighter 3D proximity among H3K27ac- and BRD4-high active chromatin spots.', 'computable_parameter': 'Pcp2_active_hub_Spearman_rho = Spearman correlation across cells between linked_adata.X Pcp2 expression and per-cell median within-trace pairwise 3D distance among spots jointly in the top quartile for tracks.H3K27ac and tracks.BRD4.', 'analysis_plan': 'Align cells to linked_adata observations, extract Pcp2 expression, identify per-cell spots jointly high for H3K27ac and BRD4, compute within-trace pairwise Euclidean distances and aggregate to one median active-hub distance per cell, then compute Spearman rho and an exact or Monte Carlo p-value by permuting Pcp2 expression across cells.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata', 'rna_expression'], 'idea_markdown': '### Rationale\nCells with higher Pcp2 expression, expected to include Purkinje-like identity, may organize H3K27ac/BRD4-rich active chromatin into tighter 3D hubs.\n\n### Data used\nUse linked RNA expression for Pcp2 together with traced chromatin coordinates and IF tracks for H3K27ac and BRD4.\n\n### Analysis sketch\nFor each cell, compute the median within-trace 3D distance among spots jointly high for H3K27ac and BRD4, then correlate that cell-level distance with Pcp2 expression.\n\n### Expected result\nHigher Pcp2 expression is expected to associate with smaller active-hub distances if Purkinje-like active chromatin is spatially concentrated.\n\n### Validation checks\nValidate linked cell alignment, sufficient cells, enough joint-high active spots per cell, finite correlation, an exact permutation p-value, runtime budget, deterministic rerun, and shuffled-expression control.', 'cell_types': ['Purkinje', 'Granule', 'Bergmann'], 'required_fields': ['coords', 'spots.cell_id', 'spots.trace_id', 'cells.cell_type', 
'tracks.H3K27ac', 'tracks.BRD4', 'linked_adata.X', 'linked_adata.var.Pcp2'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test_with_p_value', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'Negative Spearman rho: higher Pcp2 expression should correspond to smaller H3K27ac/BRD4 active-hub distances.', 'complexity': 5, 'idea_id': 'pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46', 'metadata': {}})
# Input dataset and per-run output directory, both relative to the working directory.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the dataset only when a path is configured; the schema and linked expression
# object are taken from it, or left as None when no dataset was loaded.
cdata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
if cdata is None:
    schema = None
    adata = None
else:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Check the idea against the dataset's discovery schema before any analysis runs.
# With no schema available the review is skipped and `review` stays None.
review = None if schema is None else review_idea_against_schema(IDEA, schema)
print(None if review is None else review.to_dict())
# Explicit raise rather than `assert`, so the gate still fires when Python runs with -O
# (which strips assert statements). AssertionError is kept so the failure type and its
# payload (the review dict) are unchanged.
if review is not None and not review.accepted:
    raise AssertionError(review.to_dict())

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection before the main analysis
import numpy as np
import pandas as pd
from scipy import sparse

# Spot and cell tables: shapes, columns and cell-type composition.
print('spots shape:', cdata.spots.shape)
print('spot columns:', [*cdata.spots.columns])
print('cells shape:', cdata.cells.shape)
cell_type_counts = cdata.cells['cell_type'].value_counts()
print('cell types:', cell_type_counts.to_dict())

# Coordinates: shape and the share of finite entries.
coord_array = np.asarray(cdata.coords)
finite_fraction = float(np.isfinite(np.asarray(cdata.coords)).mean())
print('coords shape:', coord_array.shape, 'finite fraction:', finite_fraction)

# Tracks: report the shape when there is one, otherwise the container type.
if hasattr(cdata.tracks, 'shape'):
    tracks_summary = cdata.tracks.shape
else:
    tracks_summary = type(cdata.tracks)
print('tracks shape:', tracks_summary)
marker_presence = {}
for marker in ['H3K27ac','BRD4']:
    marker_presence[marker] = marker in cdata.tracks.columns
print('required tracks present:', marker_presence)

# Linked expression: observation order must match the cell table for later joins.
obs_as_text = [str(name) for name in adata.obs_names]
cells_as_text = [str(name) for name in cdata.cells.index]
print('linked adata shape:', adata.shape, 'aligned obs:', obs_as_text == cells_as_text)
var_names = [str(name) for name in adata.var_names]
has_pcp2 = 'Pcp2' in var_names
print('Pcp2 in linked genes:', has_pcp2)
if has_pcp2:
    x = adata[:, ['Pcp2']].X
    # The expression slice may be sparse; densify before flattening.
    dense_x = x.toarray() if sparse.issparse(x) else x
    pcp2 = np.asarray(dense_x).ravel()
    pcp2_preview = pd.Series(pcp2, index=adata.obs_names).round(4)
    print('Pcp2 preview:', pcp2_preview.to_dict())

# First rows of the spot identifiers and summary statistics of the two marker tracks.
print(cdata.spots[['cell_id','trace_id','chrom']].head())
print(cdata.tracks[['H3K27ac','BRD4']].describe().round(3))

spots shape: (56036, 6)
spot columns: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
cells shape: (9, 10)
cell types: {'Granule': 3, 'Bergmann': 3, 'Purkinje': 3}
coords shape: (56036, 3) finite fraction: 1.0
tracks shape: (56036, 62)
required tracks present: {'H3K27ac': True, 'BRD4': True}
linked adata shape: (9, 60) aligned obs: True
Pcp2 in linked genes: True
Pcp2 preview: {'1_0_42': 0.0, '1_0_47': 0.0, '1_0_69': 1.0, '1_0_34': 5.0, '1_0_61': 11.0, '1_0_63': 26.0, '1_0_26': 94.0, '1_0_37': 126.0, '1_0_116': 376.0}
  cell_id         trace_id  chrom
0  1_0_61  1_0_61_chr14_a2  chr14
1  1_0_61   1_0_61_chr2_a2   chr2
2  1_0_61  1_0_61_chr14_a1  chr14
3  1_0_61   1_0_61_chr2_a2   chr2
4  1_0_61  1_0_61_chr14_a1  chr14
         H3K27ac       BRD4
count  56036.000  56036.000
mean       0.374      0.034
std        0.956      0.949
min       -1.564     -2.047
25%       -0.346     -0.589
50%        0.256     -0.155
75%        0.944      0.425
max        8.550     13.093

# Main executable analysis: Pcp2 expression vs active chromatin hub proximity
import os
os.environ.setdefault("MPLBACKEND", "Agg")
import matplotlib
matplotlib.use("Agg", force=True)
import matplotlib.pyplot as plt
from IPython.display import display, Image
from scipy import sparse
from scipy.stats import spearmanr
import numpy as np
import pandas as pd
import json

# One seeded generator for every random draw made later in this cell.
rng = np.random.default_rng(20250220)
result_path = RUN_OUTPUT_DIR / "pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_result.csv"
figure_path = RUN_OUTPUT_DIR / "pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_statistical_summary.png"

# Pull Pcp2 out of the linked expression matrix. Stop early unless the expression rows
# line up one-to-one, in order, with cdata.cells and the gene is actually present.
if adata is None:
    raise RuntimeError("linked_adata is unavailable")
cell_ids = [str(label) for label in cdata.cells.index]
adata_obs = [str(label) for label in adata.obs_names]
if adata_obs != cell_ids:
    raise RuntimeError("cdata.cells index is not aligned to linked_adata.obs_names")
if not any(str(gene) == "Pcp2" for gene in adata.var_names):
    raise RuntimeError("Pcp2 is absent from linked_adata.var_names")

pcp2_x = adata[:, ["Pcp2"]].X
# The slice may come back sparse; densify it before flattening to one value per cell.
if sparse.issparse(pcp2_x):
    pcp2_dense = pcp2_x.toarray()
else:
    pcp2_dense = pcp2_x
pcp2 = np.asarray(pcp2_dense, dtype=float).ravel()
pcp2_by_cell = pd.Series(pcp2, index=cell_ids, name="Pcp2_expression")

# A spot is "active" when it reaches the 75th percentile of BOTH marker tracks. The
# percentiles are taken over every spot in the dataset, ignoring NaNs.
tracks = cdata.tracks[["H3K27ac", "BRD4"]].astype(float)
h3_thr, brd4_thr = (
    float(np.nanquantile(tracks[marker], 0.75)) for marker in ("H3K27ac", "BRD4")
)
active_mask = tracks["H3K27ac"].ge(h3_thr) & tracks["BRD4"].ge(brd4_thr)

# Working copy of the spot identifiers with string ids and the active flag attached.
spots = (
    cdata.spots[["cell_id", "trace_id", "chrom"]]
    .astype({"cell_id": str, "trace_id": str})
    .assign(active_joint_high=active_mask.to_numpy())
)
coords = np.asarray(cdata.coords, dtype=float)

# Per-cell summary of active-hub compactness.
#
# For each cell: take its jointly-high spots, group them by trace, compute Euclidean
# distances between spot pairs inside each trace, pool those distances over the cell's
# traces and record the pooled median. Cells without any usable pair get NaN.
cell_records = []
cell_distance_vectors = {}
max_pairs_per_trace = 5000  # upper bound on the pairs evaluated for a single trace

# These arrays are the same for every cell, so build them once outside the loop.
spot_cell_ids = spots["cell_id"].to_numpy()
spot_trace_ids = spots["trace_id"].to_numpy()
spot_is_active = spots["active_joint_high"].to_numpy()
# Cell types are read by position: `cell_ids` was derived from cdata.cells.index in
# order, so zipping avoids a label lookup that depends on the index dtype.
cell_type_labels = cdata.cells["cell_type"].tolist()

for cell_id, cell_type in zip(cell_ids, cell_type_labels):
    # Row positions (0-based, into `spots` and `coords`) of this cell's active spots.
    active_idx = np.flatnonzero((spot_cell_ids == cell_id) & spot_is_active)
    distances = []
    traces_used = 0
    # Group row POSITIONS by trace. `coords` is a plain array and must be indexed
    # positionally; grouping on the DataFrame's index labels is only correct when
    # `spots` happens to carry a default 0..n-1 index.
    active_rows = pd.DataFrame(
        {"trace_id": spot_trace_ids[active_idx], "position": active_idx}
    )
    for trace_id, trace_rows in active_rows.groupby("trace_id", sort=False):
        idx = trace_rows["position"].to_numpy()
        n = len(idx)
        if n < 2:
            continue
        xyz = coords[idx]
        if not np.isfinite(xyz).all():
            # Drop spots with any non-finite coordinate, then re-check the pair minimum.
            xyz = xyz[np.isfinite(xyz).all(axis=1)]
            n = len(xyz)
            if n < 2:
                continue
        traces_used += 1
        total_pairs = n * (n - 1) // 2
        if total_pairs <= max_pairs_per_trace:
            ii, jj = np.triu_indices(n, k=1)
        else:
            # Bounded random subsampling for very large traces, drawn from the seeded
            # generator. Drawing jj from n-1 values and shifting it past ii guarantees
            # jj != ii, so no zero-distance self pairs are produced.
            ii = rng.integers(0, n, size=max_pairs_per_trace)
            jj = rng.integers(0, n - 1, size=max_pairs_per_trace)
            jj = jj + (jj >= ii)
        d = np.linalg.norm(xyz[ii] - xyz[jj], axis=1)
        distances.append(d)
    if distances:
        all_d = np.concatenate(distances)
    else:
        all_d = np.array([], dtype=float)
    cell_distance_vectors[cell_id] = all_d
    cell_records.append({
        "cell_id": cell_id,
        "cell_type": str(cell_type),
        "Pcp2_expression": float(pcp2_by_cell.loc[cell_id]),
        "active_spot_count": int(len(active_idx)),
        "active_traces_with_pairs": int(traces_used),
        "pairwise_distance_count": int(len(all_d)),
        "median_active_hub_distance_um": float(np.nanmedian(all_d)) if len(all_d) else np.nan,
    })

cell_table = pd.DataFrame(cell_records)
# Only cells with both a finite expression value and a finite median enter the test.
test_df = cell_table[np.isfinite(cell_table["Pcp2_expression"]) & np.isfinite(cell_table["median_active_hub_distance_um"])].copy()

# Hypothesis statement and method label recorded alongside the numeric result.
null_hypothesis = "Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance."
alternative_hypothesis = "Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho)."
test_method = "one-sided Spearman permutation test (1000 label shuffles)"

# The test needs at least four usable cells and some variation in both variables.
enough_cells = len(test_df) >= 4
expression_varies = test_df["Pcp2_expression"].nunique() >= 2
distance_varies = test_df["median_active_hub_distance_um"].nunique() >= 2

if not (enough_cells and expression_varies and distance_varies):
    # Finite placeholders keep downstream formatting working; the status flag marks
    # them as not being a real test result.
    observed_rho = 0.0
    null_rhos = np.array([0.0], dtype=float)
    p_value = 1.0
    effect_size = 0.0
    hypothesis_test_status = "insufficient_data"
    notes = ["Too few finite cells or insufficient variation for Spearman permutation test; returned finite placeholders."]
else:
    observed_rho = float(spearmanr(test_df["Pcp2_expression"], test_df["median_active_hub_distance_um"]).statistic)
    n_perm = 1000
    y = test_df["median_active_hub_distance_um"].to_numpy(dtype=float)
    x = test_df["Pcp2_expression"].to_numpy(dtype=float)
    # Null distribution: shuffle expression across cells, one generator draw per entry.
    null_rhos = np.fromiter(
        (float(spearmanr(rng.permutation(x), y).statistic) for _ in range(n_perm)),
        dtype=float,
        count=n_perm,
    )
    # One-sided in the expected (negative) direction; the +1 in numerator and
    # denominator keeps the estimate strictly positive.
    n_at_most_observed = int(np.count_nonzero(null_rhos <= observed_rho))
    p_value = float((n_at_most_observed + 1) / (n_perm + 1))
    effect_size = observed_rho
    hypothesis_test_status = "pass"
    notes = []

# Per-cell table with the (constant) test fields broadcast onto every row, so the CSV
# is self-describing for verification and later audit.
result_table = cell_table.assign(
    observed_statistic=observed_rho,
    effect_size=effect_size,
    p_value=p_value,
    test_method=test_method,
    null_hypothesis=null_hypothesis,
    alternative_hypothesis=alternative_hypothesis,
    h3k27ac_top_quartile_threshold=h3_thr,
    brd4_top_quartile_threshold=brd4_thr,
)
result_table.to_csv(result_path, index=False)

# Machine-readable summary of the run; key order is the order printed as JSON.
analysis_summary = dict(
    idea_id=IDEA.idea_id,
    parameter_name="Pcp2_active_hub_Spearman_rho",
    parameter_value=float(observed_rho),
    observed_statistic=float(observed_rho),
    effect_size=float(effect_size),
    p_value=float(p_value),
    test_method=test_method,
    null_hypothesis=null_hypothesis,
    alternative_hypothesis=alternative_hypothesis,
    hypothesis_test_status=hypothesis_test_status,
    n_selected_cells=int(len(test_df)),
    n_rows=int(len(result_table)),
    n_permutations=int(len(null_rhos)),
    active_spots_total=int(active_mask.sum()),
    h3k27ac_top_quartile_threshold=h3_thr,
    brd4_top_quartile_threshold=brd4_thr,
    result_path=str(result_path),
    statistical_figure_path=str(figure_path),
    notes=notes,
)

# Two-panel figure: per-cell scatter (left) and permutation null with the observed
# statistic marked (right).
plt.rcParams.update({"figure.facecolor": "white", "axes.facecolor": "white", "font.size": 10})
fig, axes = plt.subplots(1, 2, figsize=(11.5, 4.6), constrained_layout=True)
scatter_ax, null_ax = axes
colors = {"Purkinje": "#1b9e77", "Granule": "#7570b3", "Bergmann": "#d95f02"}

# Left panel: one marker per tested cell, coloured by cell type.
for ctype, type_cells in test_df.groupby("cell_type"):
    scatter_ax.scatter(
        type_cells["Pcp2_expression"],
        type_cells["median_active_hub_distance_um"],
        s=58,
        alpha=0.9,
        label=f"{ctype} (n={len(type_cells)})",
        color=colors.get(ctype, "#555555"),
        edgecolor="black",
        linewidth=0.4,
    )
if len(test_df) >= 2:
    # Straight-line fit drawn as a visual guide only; it plays no part in the test.
    xs = test_df["Pcp2_expression"].to_numpy(dtype=float)
    ys = test_df["median_active_hub_distance_um"].to_numpy(dtype=float)
    coef = np.polyfit(xs, ys, deg=1)
    xline = np.linspace(xs.min(), xs.max(), 100)
    guide = np.polyval(coef, xline)
    scatter_ax.plot(xline, guide, color="black", linestyle="--", linewidth=1.2, label="linear visual guide")
scatter_ax.set_xlabel("linked Pcp2 expression (counts)")
scatter_ax.set_ylabel("median active-hub distance (um)")
scatter_ax.set_title("Cell-level active chromatin hub proximity")
scatter_ax.legend(frameon=False, fontsize=8)
scatter_ax.grid(alpha=0.2)

# Right panel: null distribution, observed rho (red) and zero (dotted).
n_bins = 30 if len(null_rhos) > 30 else 10
null_ax.hist(null_rhos, bins=n_bins, color="#bdbdbd", edgecolor="white", label="permuted Pcp2 labels")
null_ax.axvline(observed_rho, color="#d62728", linewidth=2.0, label=f"observed rho={observed_rho:.3f}")
null_ax.axvline(0, color="black", linewidth=1.0, linestyle=":")
null_ax.set_xlabel("Spearman rho under null")
null_ax.set_ylabel("permutation count")
null_ax.set_title("One-sided permutation evidence")
null_ax.legend(frameon=False, fontsize=8)
annotation = f"p={p_value:.3g}\neffect=rho={effect_size:.3f}\nn={len(test_df)} cells\n{test_method}"
annotation_box = dict(boxstyle="round,pad=0.3", facecolor="white", edgecolor="#cccccc", alpha=0.95)
null_ax.text(
    0.03,
    0.97,
    annotation,
    transform=null_ax.transAxes,
    va="top",
    ha="left",
    bbox=annotation_box,
    fontsize=8,
)

fig.suptitle("Pcp2-linked proximity of H3K27ac/BRD4-high active chromatin hubs", fontsize=12)
fig.savefig(figure_path, dpi=180, bbox_inches="tight")
# Show the live figure, release it, then show the PNG that was written to disk.
display(fig)
plt.close(fig)
display(Image(filename=str(figure_path)))

print(result_table.round(4).to_string(index=False))
print(json.dumps(analysis_summary, indent=2))

Figure(1150x460)
<IPython.core.display.Image object>
cell_id cell_type  Pcp2_expression  active_spot_count  active_traces_with_pairs  pairwise_distance_count  median_active_hub_distance_um  observed_statistic  effect_size  p_value                                               test_method                                                                                                             null_hypothesis                                                                                                                               alternative_hypothesis  h3k27ac_top_quartile_threshold  brd4_top_quartile_threshold
 1_0_42   Granule              0.0                319                        20                     3668                         1.0795              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
 1_0_47   Granule              0.0                332                        19                     4409                         1.4729              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
 1_0_69   Granule              1.0                302                        20                     3454                         1.1619              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
 1_0_34  Bergmann              5.0                135                        13                      908                         1.1530              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
 1_0_61  Bergmann             11.0                736                        29                    14536                         1.7223              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
 1_0_63  Bergmann             26.0                696                        24                    15417                         1.7177              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
 1_0_26  Purkinje             94.0                389                        14                     9176                         1.9469              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
 1_0_37  Purkinje            126.0                513                        14                    14226                         2.3688              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
1_0_116  Purkinje            376.0                695                        27                    13758                         2.2717              0.8787       0.8787    0.999 one-sided Spearman permutation test (1000 label shuffles) Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance. Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).                           0.944                       0.4249
{
  "idea_id": "pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46",
  "parameter_name": "Pcp2_active_hub_Spearman_rho",
  "parameter_value": 0.8786687791935092,
  "observed_statistic": 0.8786687791935092,
  "effect_size": 0.8786687791935092,
  "p_value": 0.999000999000999,
  "test_method": "one-sided Spearman permutation test (1000 label shuffles)",
  "null_hypothesis": "Pcp2 expression is exchangeable across cells and is not negatively associated with median H3K27ac/BRD4 active-hub distance.",
  "alternative_hypothesis": "Cells with higher Pcp2 expression have smaller median within-trace distances among jointly H3K27ac-high and BRD4-high spots (negative Spearman rho).",
  "hypothesis_test_status": "pass",
  "n_selected_cells": 9,
  "n_rows": 9,
  "n_permutations": 1000,
  "active_spots_total": 4117,
  "h3k27ac_top_quartile_threshold": 0.94403,
  "brd4_top_quartile_threshold": 0.4249,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_statistical_summary.png",
  "notes": []
}

# Ensure saved artifacts are present at the requested workspace-relative paths.
from pathlib import Path
import shutil, os
print('cwd for artifact check:', os.getcwd())

# Absolute location where the run's artifacts are expected.
requested_dir = Path('/Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
requested_dir.mkdir(parents=True, exist_ok=True)
requested_result = requested_dir.joinpath('pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_result.csv')
requested_figure = requested_dir.joinpath('pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_statistical_summary.png')

# If a file landed directly in the working directory, copy it to the expected place.
stray_to_target = (
    (Path('pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_result.csv'), requested_result),
    (Path('pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_statistical_summary.png'), requested_figure),
)
for stray, target in stray_to_target:
    if not stray.exists():
        continue
    if stray.resolve() == target.resolve():
        continue
    shutil.copy2(stray, target)

# Re-write the table and point the summary at the workspace-relative paths, but only
# when the earlier analysis cell actually defined those objects.
if 'result_table' in globals():
    result_table.to_csv(requested_result, index=False)
if 'analysis_summary' in globals():
    analysis_summary.update({
        'result_path': 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_result.csv',
        'statistical_figure_path': 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_statistical_summary.png',
    })

print('result exists:', requested_result.exists(), requested_result)
print('figure exists:', requested_figure.exists(), requested_figure)

cwd for artifact check: /Users/weizexu/Projects/U-Chrom
result exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_result.csv
figure exists: True /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_statistical_summary.png

# Every check named by the idea starts as 'not_run'; the statistical test entry is
# added when the idea did not name it, because the pass/fail decision reads it.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
notes = []
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    """Return the keys of `checks` equal to `prefix` or starting with `prefix + ':'`."""
    matched = []
    for name in checks:
        if name == prefix:
            matched.append(name)
        elif name.startswith(prefix + ':'):
            matched.append(name)
    return matched

def _set_check(prefix, value):
    """Store `value` under every key matching `prefix`, or under `prefix` itself if none match."""
    targets = _check_keys(prefix) or [prefix]
    for name in targets:
        checks[name] = value

def _check_status(prefix):
    """Collapse the statuses of all keys matching `prefix` into a single value.

    Returns None when no key matches, 'fail' if any match failed, 'pass' if every
    match passed, and otherwise the status of the first matching key.
    """
    statuses = [checks[name] for name in _check_keys(prefix)]
    if len(statuses) == 0:
        return None
    if any(state == 'fail' for state in statuses):
        return 'fail'
    if not any(state != 'pass' for state in statuses):
        return 'pass'
    return statuses[0]

# The schema review must have run and accepted the idea for its fields to count as present.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # Alignment passes only when both tables exist and their ids agree element for
    # element. List equality already fails on a length mismatch, so differing lengths
    # or missing data can no longer slip through as aligned.
    aligned = (
        cdata is not None
        and adata is not None
        and list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    )
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Prefer the count reported by the analysis; fall back to the result table when it
    # has a cell_type column, then to the dataset's cell table, then to zero.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None:
        if 'cell_type' in getattr(result_table, 'columns', []):
            n_cells = len(result_table)
        elif cdata is not None and getattr(cdata, 'n_cells', 0):
            n_cells = len(cdata.cells)
        else:
            n_cells = 0
    cell_count_verdict = 'pass' if n_cells >= 1 else 'fail'
    _set_check('minimum_cell_count', cell_count_verdict)

if _check_keys('minimum_spot_or_trace_count'):
    # Judged by the number of result rows: reported count first, table length second.
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        if result_table is not None:
            n_rows = len(result_table)
        else:
            n_rows = 0
    row_count_verdict = 'pass' if n_rows >= 1 else 'fail'
    _set_check('minimum_spot_or_trace_count', row_count_verdict)

if _check_keys('finite_numeric_output'):
    # The headline parameter has to be present and finite.
    value = analysis_summary.get('parameter_value')
    if value is not None and np.isfinite(value):
        _set_check('finite_numeric_output', 'pass')
    else:
        _set_check('finite_numeric_output', 'fail')
if _check_keys('statistical_hypothesis_test'):
    # Verify that the analysis reported a complete hypothesis test: non-empty method and
    # hypothesis texts, a finite p-value in [0, 1], finite statistic and effect size, a
    # status other than 'insufficient_data', and the p_value/test_method result columns.

    def _as_float(raw):
        # Missing or non-numeric entries become NaN so the finiteness checks reject them.
        try:
            return float(raw)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    p_float = _as_float(p_value)
    stat_float = _as_float(observed_statistic)
    effect_float = _as_float(effect_size)
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    test_verdict = 'pass' if has_required_test else 'fail'
    _set_check('statistical_hypothesis_test', test_verdict)
    # An idea may declare a suffixed variant such as
    # 'statistical_hypothesis_test_with_p_value'. _set_check only reaches the bare name
    # and 'name:...' keys, so without this loop such a variant stays 'not_run' even
    # though the test it names was evaluated right here.
    for key in checks:
        if key.startswith('statistical_hypothesis_test_'):
            checks[key] = test_verdict
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Text-based detection: look for a control/permutation keyword in the test method,
    # in the summary's key names, or in the result table's column names.
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(name).lower() for name in analysis_summary.keys())
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(column).lower() for column in result_table.columns)
    else:
        result_columns_text = ''
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    control_tokens = ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    has_control_or_permutation = False
    for token in control_tokens:
        if token in control_text:
            has_control_or_permutation = True
            break
    if has_control_or_permutation:
        _set_check('negative_control_or_permutation', 'pass')
    else:
        _set_check('negative_control_or_permutation', 'not_implemented')

# Control-type checks that nothing evaluated are reported as 'not_implemented'
# rather than 'not_run'.
for check in list(checks):
    if checks[check] != 'not_run':
        continue
    if 'negative_control' in check or check.endswith('_control'):
        checks[check] = 'not_implemented'

# Overall status: any failed mandatory check, or an empty result, fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [name for name in required_for_pass if _check_status(name) == 'fail']
notes.extend(f'{name} failed' for name in failed_required)
status = 'fail' if failed_required else 'pass'

n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    if result_table is not None:
        n_rows_for_status = len(result_table)
    else:
        n_rows_for_status = 0
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final record: verification notes first, then any notes the analysis attached.
verification = dict(
    idea_id=IDEA.idea_id,
    status=status,
    checks=checks,
    parameter_value=analysis_summary.get('parameter_value'),
    p_value=analysis_summary.get('p_value'),
    test_method=analysis_summary.get('test_method'),
    effect_size=analysis_summary.get('effect_size'),
    result_path=analysis_summary.get('result_path'),
    notes=notes + analysis_summary.get('notes', []),
)
print(json.dumps(verification, indent=2))

{
  "idea_id": "pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_with_p_value": "not_run",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": 0.8786687791935092,
  "p_value": 0.999000999000999,
  "test_method": "one-sided Spearman permutation test (1000 label shuffles)",
  "effect_size": 0.8786687791935092,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/pcp2-linked-active-chromatin-hub-proximity-acros-f62e7efc46_result.csv",
  "notes": []
}

Auto-discovery idea: Pcp2-linked active chromatin hub proximity across cell types

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and compact analysis plan

Statistical figure

Runner verification summary

Final interpretation

Final interpretation