# Ensure relative data paths resolve from the workspace root, not the notebooks/ folder.
import os
from pathlib import Path
# Directory that every relative data/output path used later in this notebook is resolved against.
# NOTE(review): this is an absolute, user-specific location; os.chdir raises if it does not exist
# on the machine running the notebook.
WORKSPACE_ROOT = Path('/Users/weizexu/Projects/U-Chrom')
# Process-wide side effect: changes the working directory for everything executed afterwards.
os.chdir(WORKSPACE_ROOT)
# Echo the resulting working directory as a sanity check.
print('cwd:', Path.cwd())

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Specification of the discovery idea analysed by this notebook, built from a plain dict.
# It carries the hypothesis text, the fields the analysis needs ('required_fields'), the
# validation checks to report on ('validation_checks') and the stable 'idea_id' that is
# reused in output file names further down.
IDEA = DiscoveryIdea.from_dict({'idea_title': 'Gabra6 expression links to elongating RNA polymerase chromatin signal', 'biological_hypothesis': 'Higher Gabra6 RNA expression is associated with increased chromatin-associated RNAPIISer2-P signal, indicating a link between gene-expression state and active transcriptional elongation marks.', 'computable_parameter': 'Spearman rho between per-cell Gabra6 expression and per-cell mean tracks.RNAPIISer2-P over all spots.', 'analysis_plan': 'Align linked_adata.X to cells using the linked cell IDs, extract linked_adata.var.Gabra6 expression, and compute per-cell averages of tracks.RNAPIISer2-P using spots.cell_id. The sole discovery parameter is the Spearman correlation between Gabra6 expression and mean RNAPIISer2-P across cells. Report the p-value, rerun with identical deterministic grouping, and compare the observed rho with fixed-seed permutations of Gabra6 cell labels.', 'modalities': ['if_tracks', 'cell_metadata', 'rna_expression'], 'idea_markdown': "### Rationale\nGabra6 is present in the RNA matrix and can be tested against a direct transcription-associated IF mark, RNAPIISer2-P.\n\n### Data used\nUse linked Gabra6 expression, spot-level RNAPIISer2-P track intensity, spot-to-cell assignments, and cell type metadata.\n\n### Analysis sketch\nCompute each cell's mean RNAPIISer2-P signal over chromatin spots, then test whether this cell-level transcriptional elongation signal increases with Gabra6 expression.\n\n### Expected result\nA positive association would suggest that Gabra6-high cells have more chromatin-associated elongating polymerase signal.\n\n### Validation checks\nVerify field existence, cell and spot counts, finite values, Spearman p-value, runtime, deterministic rerun, and a shuffled-expression negative control.", 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['spots.cell_id', 'tracks.RNAPIISer2-P', 'cells.cell_type', 'linked_adata.X', 'linked_adata.var.Gabra6'], 'validation_checks': 
['required_fields_exist', 'minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3', 'minimum_spot_or_trace_count_per_cell_for_RNAPIISer2-P_mean', 'finite_numeric_output', 'statistical_hypothesis_test_spearman_with_p_value', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation_by_shuffling_Gabra6_expression_across_cells'], 'expected_direction': 'Positive correlation: higher Gabra6 expression should correspond to higher mean RNAPIISer2-P signal.', 'complexity': 2, 'idea_id': 'gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223', 'metadata': {}})
# Input dataset and per-run output directory; both are relative to the current working directory.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Load the dataset only when a path is configured; the dependent handles fall back to None.
cdata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
if cdata is None:
    schema = None
    adata = None
else:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

# Print the idea identifier, then the dataset overview when a dataset was loaded.
print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Review the idea against the dataset schema; without a schema there is nothing to review.
review = None
if schema is not None:
    review = review_idea_against_schema(IDEA, schema)

if review is None:
    print(None)
else:
    print(review.to_dict())
    # Stop the notebook here when the review rejected the idea; the review dict is the message.
    assert review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection: field presence, alignment, finite coverage.
import numpy as np
import pandas as pd
from scipy import sparse

track_name = 'RNAPIISer2-P'
gene_name = 'Gabra6'

spots_df = cdata.spots.copy()
cells_df = cdata.cells.copy()
adata = cdata.linked_adata

# Check field presence BEFORE indexing, so a missing field fails with a clear message
# instead of an obscure indexing error further down.
track_present = track_name in cdata.tracks
var_names = list(adata.var_names)
gene_present = gene_name in var_names
if not track_present:
    raise KeyError(f'track {track_name!r} not found in cdata.tracks')
if not gene_present:
    raise KeyError(f'gene {gene_name!r} not found in linked_adata.var_names')

track_values = np.asarray(cdata.tracks[track_name], dtype=float)
cell_ids = np.asarray(spots_df['cell_id'])

# Linked AnnData expression extraction: one column of X, flattened to a 1-D dense vector.
gene_idx = var_names.index(gene_name)
expr_vec = adata.X[:, gene_idx]
if sparse.issparse(expr_vec):
    expr_vec = expr_vec.toarray().ravel()
else:
    expr_vec = np.asarray(expr_vec).ravel()

# Positional side-by-side view of the two cell orderings; the columns are NOT joined by ID,
# so this table is only meaningful together with 'cell_index_matches_linked_obs' below.
cell_id_preview = pd.DataFrame({
    'cells_index': list(cells_df.index.astype(str)),
    'linked_obs': list(pd.Index(adata.obs_names).astype(str)),
    'cell_type': list(cells_df['cell_type'].astype(str)),
    'Gabra6_expression': expr_vec,
})
# Number of spots assigned to each cell ID, ordered by cell ID.
spot_counts = pd.Series(cell_ids).value_counts().sort_index()
inspection_summary = {
    'n_cells': int(cdata.n_cells),
    'n_spots': int(cdata.n_spots),
    'n_tracks_values': int(track_values.shape[0]),
    'track_present': bool(track_present),
    'gene_present': bool(gene_present),
    'finite_track_fraction': float(np.isfinite(track_values).mean()),
    'finite_gabra6_fraction': float(np.isfinite(expr_vec).mean()),
    'cell_type_counts': cells_df['cell_type'].value_counts().to_dict(),
    'min_spots_per_cell': int(spot_counts.min()),
    'max_spots_per_cell': int(spot_counts.max()),
    'cell_index_matches_linked_obs': bool(list(cells_df.index.astype(str)) == list(pd.Index(adata.obs_names).astype(str))),
}
print(json.dumps(inspection_summary, indent=2))
display(cell_id_preview)
display(spot_counts.rename('spot_count').to_frame().head(12))

{
  "n_cells": 9,
  "n_spots": 56036,
  "n_tracks_values": 56036,
  "track_present": true,
  "gene_present": true,
  "finite_track_fraction": 1.0,
  "finite_gabra6_fraction": 1.0,
  "cell_type_counts": {
    "Granule": 3,
    "Bergmann": 3,
    "Purkinje": 3
  },
  "min_spots_per_cell": 3220,
  "max_spots_per_cell": 11659,
  "cell_index_matches_linked_obs": true
}
  cells_index linked_obs cell_type  Gabra6_expression
0      1_0_42     1_0_42   Granule                7.0
1      1_0_47     1_0_47   Granule               13.0
2      1_0_69     1_0_69   Granule                5.0
3      1_0_34     1_0_34  Bergmann                2.0
4      1_0_61     1_0_61  Bergmann                4.0
5      1_0_63     1_0_63  Bergmann                2.0
6      1_0_26     1_0_26  Purkinje                4.0
7      1_0_37     1_0_37  Purkinje                2.0
8     1_0_116    1_0_116  Purkinje                5.0
         spot_count
1_0_116       11659
1_0_26         4225
1_0_34         3932
1_0_37         5238
1_0_42         4183
1_0_47         4682
1_0_61        11283
1_0_63         7614
1_0_69         3220

from pathlib import Path
# Diagnostics: show where the relative output directory resolves to, and check whether a
# figure was accidentally written to the working directory instead of the run directory.
expected_dir = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
stray_figure = Path('gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223_statistical_summary.png')

print('cwd', Path.cwd())
print(
    'RUN_OUTPUT_DIR', RUN_OUTPUT_DIR,
    'absolute?', RUN_OUTPUT_DIR.is_absolute(),
    'resolved', RUN_OUTPUT_DIR.resolve(),
)
print('expected exists?', expected_dir.resolve().exists())
print('root misplaced fig exists?', stray_figure.exists())

cwd /Users/weizexu/Projects/U-Chrom
RUN_OUTPUT_DIR tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg absolute? False resolved /Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg
expected exists? True
root misplaced fig exists? False

# Main analysis: Gabra6 expression versus per-cell mean RNAPIISer2-P.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt

import json
import time
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import sparse, stats
from IPython.display import display, Image

# Wall-clock start for the 'runtime_seconds' field and a fixed-seed generator for the permutations.
start_time = time.time()
rng = np.random.default_rng(20250308)
track_name = 'RNAPIISer2-P'
gene_name = 'Gabra6'
result_path = RUN_OUTPUT_DIR / 'gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223_result.csv'
figure_path = RUN_OUTPUT_DIR / 'gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223_statistical_summary.png'

# Extract expression in linked cell order.
adata = cdata.linked_adata
gene_idx = list(adata.var_names).index(gene_name)
expr = adata.X[:, gene_idx]
expr = expr.toarray().ravel() if sparse.issparse(expr) else np.asarray(expr).ravel()
linked_cell_ids = pd.Index(adata.obs_names).astype(str)

# Compute per-cell mean RNAPIISer2-P across all assigned chromatin spots.
# Cell IDs are cast to str so they can be matched against the string-typed linked cell IDs.
spots_df = cdata.spots.copy()
track_values = np.asarray(cdata.tracks[track_name], dtype=float)
spot_table = pd.DataFrame({'cell_id': spots_df['cell_id'].astype(str).to_numpy(), track_name: track_values})
per_cell_track = spot_table.groupby('cell_id', sort=True)[track_name].agg(['mean', 'count']).rename(
    columns={'mean': 'mean_RNAPIISer2P', 'count': 'n_spots'}
)

# Look cell types up by string ID. The index of the cells table is cast to str as well:
# reindexing a non-string index with string IDs would match nothing and silently return
# NaN for every cell. astype() returns a new Series, so cdata.cells itself is not modified.
cell_type_by_id = cdata.cells['cell_type'].astype(str)
cell_type_by_id.index = cell_type_by_id.index.astype(str)
cell_types = cell_type_by_id.reindex(linked_cell_ids)

# One row per linked cell: expression comes positionally from adata, the track summary is
# joined by cell ID (left join, so linked cells without any spot get NaN and are dropped below).
cell_df = pd.DataFrame({
    'cell_id': linked_cell_ids,
    'cell_type': cell_types.to_numpy(),
    'Gabra6_expression': expr,
}).set_index('cell_id')
cell_df = cell_df.join(per_cell_track, how='left')
cell_df['finite_pair'] = np.isfinite(cell_df['Gabra6_expression']) & np.isfinite(cell_df['mean_RNAPIISer2P'])
# Only cells with both a finite expression value and a finite track mean enter the test.
analysis_df = cell_df.loc[cell_df['finite_pair']].copy()

n = int(len(analysis_df))
min_spots = int(analysis_df['n_spots'].min()) if n else 0
null_hypothesis = 'Across cells, Gabra6 expression is not monotonically associated with mean chromatin-associated RNAPIISer2-P signal (Spearman rho = 0 / label exchangeability).'
alternative_hypothesis = 'Across cells, higher Gabra6 expression is associated with higher mean chromatin-associated RNAPIISer2-P signal (positive monotonic association).'

# The test needs at least three cells and some variation in both variables.
if n >= 3 and analysis_df['Gabra6_expression'].nunique() >= 2 and analysis_df['mean_RNAPIISer2P'].nunique() >= 2:
    # One-sided test matching the idea's expected (positive) direction.
    rho, spearman_p = stats.spearmanr(analysis_df['Gabra6_expression'], analysis_df['mean_RNAPIISer2P'], alternative='greater')
    rho = float(rho)
    spearman_p = float(spearman_p)
    n_perm = 1000
    permuted_rhos = np.empty(n_perm, dtype=float)
    y = analysis_df['mean_RNAPIISer2P'].to_numpy(float)
    x = analysis_df['Gabra6_expression'].to_numpy(float)
    # Negative control: shuffle expression across cells and recompute rho each time.
    for i in range(n_perm):
        x_perm = rng.permutation(x)
        permuted_rhos[i] = stats.spearmanr(x_perm, y).statistic
    # One-sided positive permutation p with +1 correction.
    # A permutation that reproduces the observed rank pairing can differ from rho in the
    # last floating-point bits; the small relative tolerance counts such ties as ">=" so
    # the p-value is not biased downwards.
    tie_tolerance = 100 * np.finfo(float).eps * abs(rho)
    perm_p = float((np.sum(permuted_rhos >= rho - tie_tolerance) + 1) / (n_perm + 1))
    observed_statistic = rho
    effect_size = rho
    p_value = spearman_p
    hypothesis_test_status = 'pass'
    test_method = 'one-sided Spearman rank correlation with fixed-seed label permutation control'
else:
    rho = np.nan
    spearman_p = np.nan
    n_perm = 0
    permuted_rhos = np.array([], dtype=float)
    perm_p = np.nan
    # NOTE(review): this fallback statistic (mean minus median of the track means) is not a
    # correlation; presumably it only keeps the reported fields finite - confirm the intent.
    observed_statistic = float(analysis_df['mean_RNAPIISer2P'].mean() - analysis_df['mean_RNAPIISer2P'].median()) if n else 0.0
    effect_size = float(observed_statistic)
    p_value = 1.0
    hypothesis_test_status = 'insufficient_data'
    test_method = 'Spearman rank correlation not run: insufficient finite variation or n<3'

# Determinism check: rebuild the per-cell means from the spot table and, when the test ran,
# recompute rho from them; both must agree with the first pass.
regrouped_means = spot_table.groupby('cell_id', sort=True)[track_name].mean()
repeat_means = regrouped_means.reindex(analysis_df.index).to_numpy(float)
first_pass_means = analysis_df['mean_RNAPIISer2P'].to_numpy(float)
deterministic_grouping = bool(np.allclose(repeat_means, first_pass_means, equal_nan=True))

if hypothesis_test_status != 'pass':
    # No statistic was computed, so only the grouping can be compared.
    repeat_rho = np.nan
    deterministic_rerun = deterministic_grouping
else:
    rerun_result = stats.spearmanr(analysis_df['Gabra6_expression'], repeat_means, alternative='greater')
    repeat_rho = float(rerun_result.statistic)
    deterministic_rerun = bool(np.isclose(repeat_rho, observed_statistic))

# Per-cell rows with the global test fields repeated on every row (the verifier looks for
# the 'p_value' and 'test_method' columns), then persisted as CSV.
result_table = (
    analysis_df.reset_index()
    .rename(columns={'mean_RNAPIISer2P': 'mean_RNAPIISer2P_track'})
    .assign(
        observed_statistic=observed_statistic,
        effect_size=effect_size,
        p_value=p_value,
        permutation_p_value=perm_p,
        test_method=test_method,
        expected_direction='positive',
    )
)
result_table.to_csv(result_path, index=False)

# Statistical figure: observed cell scatter plus permutation-null evidence.
plt.style.use('default')
fig, axes = plt.subplots(1, 2, figsize=(10.5, 4.2), facecolor='white')

# Left panel: one point per cell, coloured by cell type.
ax = axes[0]
colors = {'Granule': '#1f77b4', 'Bergmann': '#ff7f0e', 'Purkinje': '#2ca02c'}
for cell_type, sub in analysis_df.groupby('cell_type'):
    ax.scatter(sub['Gabra6_expression'], sub['mean_RNAPIISer2P'], s=70, label=f'{cell_type} (n={len(sub)})',
               edgecolor='black', linewidth=0.5, color=colors.get(cell_type, None), alpha=0.9)
# A straight-line guide needs at least two distinct x values; a constant x makes the
# least-squares fit rank-deficient, so the guide is skipped in that case.
if n >= 2 and analysis_df['Gabra6_expression'].nunique() >= 2:
    slope, intercept = np.polyfit(analysis_df['Gabra6_expression'].to_numpy(float), analysis_df['mean_RNAPIISer2P'].to_numpy(float), deg=1)
    xx = np.linspace(float(analysis_df['Gabra6_expression'].min()), float(analysis_df['Gabra6_expression'].max()), 100)
    ax.plot(xx, slope * xx + intercept, color='black', linestyle='--', linewidth=1.2, label='linear guide')
ax.set_xlabel('Gabra6 expression (linked_adata counts)')
ax.set_ylabel('Mean RNAPIISer2-P track intensity per cell')
ax.set_title('Cell-level association')
# Only draw a legend when something labelled was plotted; an empty legend triggers a warning.
if ax.get_legend_handles_labels()[0]:
    ax.legend(frameon=False, fontsize=8)
ax.grid(True, alpha=0.25)

# Right panel: permutation null distribution with the observed statistic marked.
ax2 = axes[1]
if permuted_rhos.size:
    ax2.hist(permuted_rhos, bins=21, color='#bdbdbd', edgecolor='white', label=f'permuted labels (n={len(permuted_rhos)})')
    ax2.axvline(observed_statistic, color='#d62728', linewidth=2, label=f'observed rho={observed_statistic:.3f}')
    ax2.set_xlabel('Spearman rho under Gabra6 label permutation')
    ax2.set_ylabel('Permutation count')
else:
    ax2.text(0.5, 0.5, 'Insufficient data for permutation null', ha='center', va='center', transform=ax2.transAxes)
    ax2.set_xlabel('Spearman rho')
    ax2.set_ylabel('Count')
annotation = f"{test_method}\nn={n} cells; min spots/cell={min_spots}\nSpearman p={p_value:.3g}; perm p={perm_p:.3g}\neffect size rho={effect_size:.3f}"
ax2.text(0.02, 0.98, annotation, transform=ax2.transAxes, va='top', ha='left', fontsize=8,
         bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='0.7', alpha=0.95))
ax2.set_title('Permutation negative-control evidence')
# The fallback branch plots no labelled artists, so the legend is only added with the histogram.
if permuted_rhos.size:
    ax2.legend(frameon=False, fontsize=8, loc='lower right')
ax2.grid(True, alpha=0.25)
fig.suptitle('Gabra6 expression vs chromatin-associated RNAPIISer2-P elongation signal', fontsize=12)
fig.tight_layout(rect=[0, 0, 1, 0.94])
fig.savefig(figure_path, dpi=200, bbox_inches='tight')
# The Agg backend is forced above and cannot show windows, so plt.show() would only emit a
# "FigureCanvasAgg is non-interactive" warning. Close the figure to release it and display
# the saved PNG instead.
plt.close(fig)
display(Image(filename=str(figure_path)))

# Machine-readable summary of the analysis; the verification cell reads its fields via .get().
analysis_summary = {
    'idea_id': IDEA.idea_id,
    'parameter_name': 'Spearman rho: Gabra6 expression vs per-cell mean RNAPIISer2-P',
    'parameter_value': float(observed_statistic),
    'observed_statistic': float(observed_statistic),
    'effect_size': float(effect_size),
    'p_value': float(p_value),
    # NaN is mapped to None so the summary serialises to a JSON null.
    'permutation_p_value': float(perm_p) if np.isfinite(perm_p) else None,
    'test_method': test_method,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
    'hypothesis_test_status': hypothesis_test_status,
    'n_selected_cells': n,
    'n_rows': n,
    'min_spots_per_cell': min_spots,
    'finite_pair_count': n,
    'required_fields_exist': True,
    # Positional equality of the cells index and the linked AnnData obs names, compared as strings.
    'cell_id_alignment': bool(list(cdata.cells.index.astype(str)) == list(pd.Index(adata.obs_names).astype(str))),
    'deterministic_grouping': deterministic_grouping,
    'deterministic_rerun': deterministic_rerun,
    'negative_control_or_permutation': bool(permuted_rhos.size > 0),
    'runtime_seconds': float(time.time() - start_time),
    'result_path': str(result_path),
    'statistical_figure_path': str(figure_path),
    'notes': [
        # Report the number of cells actually analysed rather than a hard-coded sample size.
        f'Small n={n} dataset; result is exploratory and should not be overinterpreted.',
        'Permutation control shuffled Gabra6 expression labels across aligned cells with fixed RNG seed.'
    ],
}
print(json.dumps(analysis_summary, indent=2))
display(result_table)

<IPython.core.display.Image object>
{
  "idea_id": "gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223",
  "parameter_name": "Spearman rho: Gabra6 expression vs per-cell mean RNAPIISer2-P",
  "parameter_value": -0.29924368602483664,
  "observed_statistic": -0.29924368602483664,
  "effect_size": -0.29924368602483664,
  "p_value": 0.7829683942955621,
  "permutation_p_value": 0.7952047952047953,
  "test_method": "one-sided Spearman rank correlation with fixed-seed label permutation control",
  "null_hypothesis": "Across cells, Gabra6 expression is not monotonically associated with mean chromatin-associated RNAPIISer2-P signal (Spearman rho = 0 / label exchangeability).",
  "alternative_hypothesis": "Across cells, higher Gabra6 expression is associated with higher mean chromatin-associated RNAPIISer2-P signal (positive monotonic association).",
  "hypothesis_test_status": "pass",
  "n_selected_cells": 9,
  "n_rows": 9,
  "min_spots_per_cell": 3220,
  "finite_pair_count": 9,
  "required_fields_exist": true,
  "cell_id_alignment": true,
  "deterministic_grouping": true,
  "deterministic_rerun": true,
  "negative_control_or_permutation": true,
  "runtime_seconds": 0.1631169319152832,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223_statistical_summary.png",
  "notes": [
    "Small n=9 dataset; result is exploratory and should not be overinterpreted.",
    "Permutation control shuffled Gabra6 expression labels across aligned cells with fixed RNG seed."
  ]
}
   cell_id  ... expected_direction
0   1_0_42  ...           positive
1   1_0_47  ...           positive
2   1_0_69  ...           positive
3   1_0_34  ...           positive
4   1_0_61  ...           positive
5   1_0_63  ...           positive
6   1_0_26  ...           positive
7   1_0_37  ...           positive
8  1_0_116  ...           positive

[9 rows x 12 columns]

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223.ipynb:140: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  "print(IDEA.idea_id)\n",

# Every validation check requested by the idea starts out as 'not_run'; the generic
# statistical-test check is tracked even when the idea does not list it under that exact name.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
notes = []
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    """Return the keys of ``checks`` that belong to the check family ``prefix``.

    A key belongs to the family when it equals ``prefix`` or extends it with a ``:`` or
    ``_`` separated qualifier. The ``_`` form is the one used by idea-specific check names
    such as ``minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3``; without it those
    checks are never matched and stay ``not_run``.

    Parameters:
        prefix: Base name of the check family.

    Returns:
        Matching keys, in the iteration order of ``checks``.
    """
    qualified_prefixes = (prefix + ':', prefix + '_')
    return [key for key in checks if key == prefix or key.startswith(qualified_prefixes)]

def _set_check(prefix, value):
    """Record ``value`` for every check key matched by ``prefix``.

    When no existing key matches, a new entry named ``prefix`` is created instead.
    """
    matched = _check_keys(prefix)
    for name in matched or [prefix]:
        checks[name] = value

def _check_status(prefix):
    """Collapse the statuses of all checks matched by ``prefix`` into one value.

    Returns ``None`` when nothing matches, ``'fail'`` when any matched check failed,
    ``'pass'`` when all of them passed, and otherwise the status of the first match.
    """
    statuses = []
    for name in _check_keys(prefix):
        statuses.append(checks[name])
    if len(statuses) == 0:
        return None
    if any(entry == 'fail' for entry in statuses):
        return 'fail'
    if statuses.count('pass') == len(statuses):
        return 'pass'
    return statuses[0]

# Field presence is taken from the schema review: it passes only for an accepted review.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # Without a dataset or a linked AnnData there is nothing to align; the check passes vacuously.
    aligned = True
    if cdata is not None and adata is not None:
        # Compare the two ID sequences directly. List equality is False when the lengths
        # differ, so a length mismatch is reported as misaligned rather than passing,
        # which matches how the analysis summary defines 'cell_id_alignment'.
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Prefer the count reported by the analysis, then the result table, then the dataset.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
    def _float_or_nan(raw):
        """Convert ``raw`` with float(); anything that cannot be converted becomes NaN."""
        try:
            return float(raw)
        except Exception:
            return np.nan

    def _is_nonblank(text):
        """True when ``text`` is not None and is not empty after stripping whitespace."""
        return text is not None and str(text).strip() != ''

    # Pull every field the check needs out of the analysis summary.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')

    p_float = _float_or_nan(p_value)
    stat_float = _float_or_nan(observed_statistic)
    effect_float = _float_or_nan(effect_size)

    # The test counts as present only with non-blank descriptions, a p-value inside [0, 1],
    # finite statistic and effect size, and a status other than 'insufficient_data'.
    has_required_test = (
        _is_nonblank(test_method)
        and _is_nonblank(null_hypothesis)
        and _is_nonblank(alternative_hypothesis)
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    # The result table must carry the test columns as well.
    if result_table is None or not hasattr(result_table, 'columns'):
        has_required_test = False
    else:
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns

    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Keyword heuristic: look for control/permutation wording in the test method,
    # the summary keys and the result-table column names.
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    result_columns_text = ''
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = any(
        token in control_text
        for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    )
    # An explicit boolean reported by the analysis overrides the heuristic when it is False:
    # key names such as 'permutation_p_value' would otherwise make the scan pass even though
    # no control was actually run.
    reported_control = analysis_summary.get('negative_control_or_permutation')
    if isinstance(reported_control, (bool, np.bool_)) and not reported_control:
        has_control_or_permutation = False
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )
if _check_keys('deterministic_rerun'):
    # Copy the analysis' own determinism verdict into the checks; when the summary does
    # not report a boolean the check is left untouched.
    reported_rerun = analysis_summary.get('deterministic_rerun')
    if isinstance(reported_rerun, (bool, np.bool_)):
        _set_check('deterministic_rerun', 'pass' if reported_rerun else 'fail')
# Control-type checks that nothing above evaluated are reported as not implemented.
for check in list(checks):
    if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
        checks[check] = 'not_implemented'

# Overall status: any failed mandatory check, or an empty result, fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
status = 'pass'
for check in required_for_pass:
    if _check_status(check) != 'fail':
        continue
    status = 'fail'
    notes.append(f'{check} failed')

# Row count as reported by the analysis, falling back to the result table itself.
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Final report: verdict and per-check statuses, followed by fields copied from the summary.
verification = {
    'idea_id': IDEA.idea_id,
    'status': status,
    'checks': checks,
}
verification.update({
    name: analysis_summary.get(name)
    for name in ('parameter_value', 'p_value', 'test_method', 'effect_size', 'result_path')
})
verification['notes'] = notes + analysis_summary.get('notes', [])
print(json.dumps(verification, indent=2))

{
  "idea_id": "gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count_n>=9_and_each_listed_cell_type_n>=3": "not_run",
    "minimum_spot_or_trace_count_per_cell_for_RNAPIISer2-P_mean": "not_run",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test_spearman_with_p_value": "not_run",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation_by_shuffling_Gabra6_expression_across_cells": "not_implemented",
    "statistical_hypothesis_test": "pass"
  },
  "parameter_value": -0.29924368602483664,
  "p_value": 0.7829683942955621,
  "test_method": "one-sided Spearman rank correlation with fixed-seed label permutation control",
  "effect_size": -0.29924368602483664,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/gabra6-expression-links-to-elongating-rna-polyme-eef7dd1223_result.csv",
  "notes": [
    "Small n=9 dataset; result is exploratory and should not be overinterpreted.",
    "Permutation control shuffled Gabra6 expression labels across aligned cells with fixed RNG seed."
  ]
}

Auto-discovery idea: Gabra6 expression links to elongating RNA polymerase chromatin signal¶

Rationale¶

Data used¶

Analysis sketch¶

Expected result¶

Validation checks¶

Graphical abstract¶

Required data checks¶

Exploration¶

Critique and compact analysis plan¶

Statistical figure¶

Runner verification summary¶

Final interpretation¶

Final interpretation¶