"""Notebook scaffolding for auto-discovery idea exploration."""
from __future__ import annotations
import base64
import json
import os
import sys
import traceback
from contextlib import redirect_stderr, redirect_stdout
from io import BytesIO
from io import StringIO
from pathlib import Path
from typing import Any, Mapping
from .evidence import structured_conclusion_markdown
from .ideas import DiscoveryIdea
def create_exploration_notebook(
    idea: DiscoveryIdea | Mapping[str, Any],
    output_path: str | Path,
    *,
    h5cd_path: str | Path | None = None,
    run_output_dir: str | Path | None = None,
    analysis_code: str | None = None,
    verification_code: str | None = None,
    kernel_name: str = "python3",
) -> Path:
    """Write the standard exploration notebook scaffold for a single idea.

    The scaffold fixes the audit trail and the verification contract; the
    code agent is expected to edit and execute the resulting notebook freely.

    Parameters
    ----------
    idea
        A ``DiscoveryIdea`` or a mapping accepted by ``DiscoveryIdea.from_dict``.
    output_path
        Destination of the notebook JSON. Missing parent directories are created.
    h5cd_path
        Embedded in the setup cell as ``H5CD_PATH`` and recorded in the
        notebook metadata; ``None`` is written as the literal ``None``.
    run_output_dir
        Embedded in the setup cell as ``RUN_OUTPUT_DIR``. When ``None`` the
        generated cell uses the expression ``Path.cwd()`` instead.
    analysis_code
        Source of the ``exploration_code`` cell; a falsy value selects the
        default placeholder.
    verification_code
        Source of the ``verification`` cell; a falsy value selects the
        default verification stub.
    kernel_name
        Used for both ``name`` and ``display_name`` of the kernelspec.

    Returns
    -------
    Path
        ``output_path`` as a ``Path``.
    """
    if not isinstance(idea, DiscoveryIdea):
        idea = DiscoveryIdea.from_dict(idea)

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Python expressions spliced verbatim into the generated setup cell.
    if h5cd_path is None:
        h5cd_literal = "None"
    else:
        h5cd_literal = repr(str(h5cd_path))
    if run_output_dir is None:
        run_dir_expr = "Path.cwd()"
    else:
        run_dir_expr = f"Path({str(run_output_dir)!r})"

    cells = []

    # 1. Human-readable idea brief.
    cells.append(_markdown_cell(_idea_markdown_cell(idea), cell_id="idea_metadata"))

    # 2. Setup: imports, the embedded idea, paths, and optional data loading.
    setup_imports = (
        "from pathlib import Path\n"
        "import json\n"
        "import os\n"
        "os.environ.setdefault('MPLBACKEND', 'Agg')\n"
        "import numpy as np\n"
        "import pandas as pd\n"
        "import matplotlib\n"
        "matplotlib.use('Agg', force=True)\n"
        "import matplotlib.pyplot as plt\n"
        "from uchrom import ChromData\n"
        "from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema\n\n"
    )
    setup_bindings = (
        f"IDEA = DiscoveryIdea.from_dict({idea.to_dict()!r})\n"
        f"H5CD_PATH = {h5cd_literal}\n"
        f"RUN_OUTPUT_DIR = {run_dir_expr}\n"
    )
    setup_loading = (
        "RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n"
        "cdata = ChromData.read(H5CD_PATH) if H5CD_PATH else None\n"
        "schema = cdata.discovery_schema if cdata is not None else None\n"
        "adata = cdata.linked_adata if cdata is not None else None\n"
        "print(IDEA.idea_id)\n"
        "if cdata is not None:\n"
        " print(cdata)\n"
        " print(cdata.describe_for_agent(max_items=20))\n"
    )
    cells.append(
        _code_cell(setup_imports + setup_bindings + setup_loading, cell_id="setup")
    )

    # 3. Schema review of the idea against the loaded data (skipped without data).
    cells.append(_markdown_cell("## Required data checks", cell_id="required_data_checks"))
    schema_review_source = (
        "review = review_idea_against_schema(IDEA, schema) if schema is not None else None\n"
        "print(None if review is None else review.to_dict())\n"
        "assert review is None or review.accepted, review.to_dict()\n"
    )
    cells.append(_code_cell(schema_review_source, cell_id="schema_review"))

    # 4. Free-form exploration area.
    cells.append(
        _markdown_cell(
            "## Exploration\n\nThe code agent can freely add cells below this point.",
            cell_id="exploration",
        )
    )
    exploration_source = analysis_code or _default_analysis_code()
    cells.append(_code_cell(exploration_source, cell_id="exploration_code"))

    # 5. Verification section.
    verification_intro = (
        "## Runner verification summary\n\n"
        "This scaffolded section is generated by U-Chrom. The notebook agent "
        "executes it after exploration, and the runner re-executes it during "
        "final verification."
    )
    cells.append(_markdown_cell(verification_intro, cell_id="verification_summary"))
    verification_source = verification_code or _default_verification_code()
    cells.append(_code_cell(verification_source, cell_id="verification"))

    kernelspec = {
        "display_name": kernel_name,
        "language": "python",
        "name": kernel_name,
    }
    provenance = {
        "idea_id": idea.idea_id,
        "idea_title": idea.idea_title,
        "h5cd_path": None if h5cd_path is None else str(h5cd_path),
    }
    notebook = {
        "cells": cells,
        "metadata": {
            "kernelspec": kernelspec,
            "language_info": {"name": "python"},
            "uchrom_auto_discovery": provenance,
        },
        "nbformat": 4,
        "nbformat_minor": 5,
    }
    output_path.write_text(json.dumps(notebook, indent=2))
    return output_path
def execute_notebook_python(
    notebook_path: str | Path,
    *,
    stop_on_error: bool = False,
) -> dict[str, Any]:
    """Execute the Python code cells of a notebook JSON file in-process.

    This lightweight executor is intentionally small: it keeps one shared
    Python namespace across code cells, captures stdout/stderr/error text and
    newly opened matplotlib figures, writes the outputs back into the notebook
    file, and returns the namespace entries commonly used by the
    auto-discovery runner. It is meant for deterministic smoke tests;
    Pantheon/Jupyter can still execute the same notebooks in richer
    interactive runs.

    Parameters
    ----------
    notebook_path
        Notebook file to execute. It is read as UTF-8 and rewritten in place.
    stop_on_error
        When true, execution stops after the first code cell that raises.
        Output captured from that cell before the failure is still recorded.

    Returns
    -------
    dict
        ``ok`` (``False`` if any cell raised), ``errors`` (one dict with
        ``type``/``message``/``traceback`` per failing cell),
        ``verification`` and ``result_table`` (taken from the final
        namespace, ``None`` when absent) and ``namespace_keys`` (sorted
        names that do not start with ``"__"``).

    Notes
    -----
    Cell sources are run with ``exec`` inside the current interpreter, with
    the privileges of the calling process. Only execute notebooks whose
    content is trusted.
    """
    notebook_path = Path(notebook_path)
    # Notebook files are UTF-8 JSON; do not rely on the locale's default
    # encoding, which breaks on non-ASCII content saved by other tools.
    nb = json.loads(notebook_path.read_text(encoding="utf-8"))
    _configure_matplotlib_backend()
    namespace: dict[str, Any] = {
        "__name__": "__uchrom_auto_discovery_notebook__",
        "__file__": str(notebook_path),
        "display": _display,
    }
    ok = True
    errors: list[dict[str, str]] = []
    execution_count = 0
    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "code":
            continue
        execution_count += 1
        cell["execution_count"] = execution_count
        source = "".join(cell.get("source", []))
        # Snapshot open figures so only figures created by this cell are captured.
        before_figures = _current_matplotlib_figures()
        stdout = StringIO()
        stderr = StringIO()
        outputs: list[dict[str, Any]] = []
        failed = False
        try:
            with redirect_stdout(stdout), redirect_stderr(stderr):
                # SECURITY: runs arbitrary notebook code in this process.
                exec(compile(source, str(notebook_path), "exec"), namespace)
        except Exception as exc:  # pragma: no cover - exercised by runner failures
            failed = True
            ok = False
            tb = traceback.format_exc()
            errors.append({"type": type(exc).__name__, "message": str(exc), "traceback": tb})
            outputs.append({
                "output_type": "error",
                "ename": type(exc).__name__,
                "evalue": str(exc),
                "traceback": tb.splitlines(),
            })
        # Streams and figures are recorded for failing cells as well, so that
        # diagnostics printed before the exception are not lost.
        stdout_text = stdout.getvalue()
        if stdout_text:
            outputs.append({
                "output_type": "stream",
                "name": "stdout",
                "text": stdout_text.splitlines(keepends=True),
            })
        stderr_text = stderr.getvalue()
        if stderr_text:
            outputs.append({
                "output_type": "stream",
                "name": "stderr",
                "text": stderr_text.splitlines(keepends=True),
            })
        outputs.extend(_capture_new_matplotlib_figures(before_figures))
        cell["outputs"] = outputs
        if failed and stop_on_error:
            break
    # default=str keeps the write from failing on non-JSON values found in the notebook.
    notebook_path.write_text(json.dumps(nb, indent=2, default=str), encoding="utf-8")
    return {
        "ok": ok,
        "errors": errors,
        "verification": namespace.get("verification"),
        "result_table": namespace.get("result_table"),
        "namespace_keys": sorted(k for k in namespace if not k.startswith("__")),
    }
def upsert_structured_conclusion(
    notebook_path: str | Path,
    idea: DiscoveryIdea | Mapping[str, Any],
    verification: Mapping[str, Any] | None,
) -> Path:
    """Insert or replace the notebook's final interpretation with standard text.

    The cell to replace is chosen in this order:

    1. the first cell whose ``id`` is ``"structured_conclusion"``;
    2. otherwise the first markdown cell whose text starts (case-insensitively)
       with ``# final interpretation`` or ``## final interpretation``.

    If neither exists, the new cell is inserted directly after the cell whose
    ``id`` is ``"verification"``, or appended at the end when there is none.

    Parameters
    ----------
    notebook_path
        Notebook JSON file; read as UTF-8 and rewritten in place.
    idea, verification
        Passed unchanged to ``structured_conclusion_markdown`` to produce the
        cell text.

    Returns
    -------
    Path
        ``notebook_path`` as a ``Path``.
    """
    notebook_path = Path(notebook_path)
    # Notebook files are UTF-8 JSON; do not rely on the locale's default encoding.
    nb = json.loads(notebook_path.read_text(encoding="utf-8"))
    source = structured_conclusion_markdown(idea, verification).splitlines(keepends=True)
    # setdefault (not get) so a notebook without a "cells" key still receives
    # the new cell instead of having it added to a detached list.
    cells = nb.setdefault("cells", [])

    target_index = next(
        (idx for idx, cell in enumerate(cells) if cell.get("id") == "structured_conclusion"),
        None,
    )
    if target_index is None:
        # Fall back to a hand-written "final interpretation" heading.
        for idx, cell in enumerate(cells):
            if cell.get("cell_type") != "markdown":
                continue
            text = "".join(cell.get("source", [])).strip().lower()
            if text.startswith(("## final interpretation", "# final interpretation")):
                target_index = idx
                break

    conclusion_cell = {
        "cell_type": "markdown",
        "id": "structured_conclusion",
        "metadata": {"generated_by": "uchrom_auto_discovery"},
        "source": source,
    }
    if target_index is not None:
        cells[target_index] = conclusion_cell
    else:
        insert_at = len(cells)
        for idx, existing in enumerate(cells):
            if existing.get("id") == "verification":
                insert_at = idx + 1
                break
        cells.insert(insert_at, conclusion_cell)

    notebook_path.write_text(json.dumps(nb, indent=2, default=str), encoding="utf-8")
    return notebook_path
def _configure_matplotlib_backend() -> None:
    """Select the non-interactive ``Agg`` backend for batch notebook runs.

    ``MPLBACKEND`` is only set when it is not already present in the
    environment. Forcing the backend through matplotlib itself is best
    effort: any failure (including matplotlib not being importable) is ignored.
    """
    if "MPLBACKEND" not in os.environ:
        os.environ["MPLBACKEND"] = "Agg"
    try:
        import matplotlib as mpl

        mpl.use("Agg", force=True)
    except Exception:
        # Best effort only; execution proceeds without a forced backend.
        pass
def _display(*objects: Any, **_: Any) -> None:
    """Minimal stand-in for IPython's ``display`` used during smoke execution.

    Prints the ``repr`` of every positional argument on its own line.
    Keyword arguments are accepted and ignored.
    """
    for item in objects:
        print(f"{item!r}")
def _current_matplotlib_figures() -> set[int]:
    """Return the numbers of the currently open pyplot figures.

    pyplot is looked up in ``sys.modules`` rather than imported, so an empty
    set is returned when it has not been loaded. An empty set is also
    returned when querying the figure numbers fails.
    """
    pyplot = sys.modules.get("matplotlib.pyplot")
    if pyplot is not None and hasattr(pyplot, "get_fignums"):
        try:
            return set(pyplot.get_fignums())
        except Exception:
            pass
    return set()
def _capture_new_matplotlib_figures(before_figures: set[int]) -> list[dict[str, Any]]:
    """Render figures opened since ``before_figures`` as notebook outputs.

    pyplot is looked up in ``sys.modules`` rather than imported; when it has
    not been loaded, or its figure numbers cannot be queried, an empty list
    is returned.

    Each open figure whose number is not in ``before_figures`` is saved as a
    PNG, wrapped in a ``display_data`` output with a base64 ``image/png``
    payload, and then closed. Capture is best effort per figure: a figure
    that fails to render is skipped but still closed, and the remaining
    figures are still processed, so a single bad figure neither hides the
    others nor stays open to be mistaken for a pre-existing figure later.

    Parameters
    ----------
    before_figures
        Figure numbers that were already open before the cell ran.

    Returns
    -------
    list of dict
        One ``display_data`` output per successfully rendered figure.
    """
    plt = sys.modules.get("matplotlib.pyplot")
    if plt is None or not hasattr(plt, "get_fignums"):
        return []
    try:
        new_figures = [num for num in plt.get_fignums() if num not in before_figures]
    except Exception:
        return []

    outputs: list[dict[str, Any]] = []
    for num in new_figures:
        fig = None
        try:
            fig = plt.figure(num)
            buf = BytesIO()
            fig.savefig(buf, format="png", bbox_inches="tight", dpi=140)
            data = base64.b64encode(buf.getvalue()).decode("ascii")
            outputs.append({
                "output_type": "display_data",
                "data": {"image/png": data, "text/plain": ["<Figure size>"]},
                "metadata": {},
            })
        except Exception:
            # Best effort: skip a figure that cannot be rendered.
            pass
        finally:
            if fig is not None:
                try:
                    plt.close(fig)
                except Exception:
                    # Closing is best effort too; never fail the cell over it.
                    pass
    return outputs
def _markdown_cell(source: str, *, cell_id: str | None = None) -> dict[str, Any]:
    """Build a markdown cell dict from ``source``.

    The text is stored as a list of lines with their line endings kept. An
    ``id`` entry is only included when ``cell_id`` is truthy.
    """
    cell: dict[str, Any] = {"cell_type": "markdown"}
    if cell_id:
        cell["id"] = cell_id
    cell["metadata"] = {}
    cell["source"] = source.splitlines(keepends=True)
    return cell
def idea_to_markdown(idea: DiscoveryIdea | Mapping[str, Any]) -> str:
    """Render a discovery idea as a readable Markdown brief.

    A mapping is first converted with ``DiscoveryIdea.from_dict``. If the idea
    carries non-blank ``idea_markdown``, that text (stripped) is returned
    as-is; otherwise a brief is assembled from the idea's individual fields,
    one paragraph per field.
    """
    if not isinstance(idea, DiscoveryIdea):
        idea = DiscoveryIdea.from_dict(idea)

    authored = idea.idea_markdown.strip()
    if authored:
        return authored

    sections = []
    sections.append(f"# Auto-discovery idea: {idea.idea_title}")
    sections.append(f"**Hypothesis.** {idea.biological_hypothesis}")
    sections.append(f"**Computable parameter.** {idea.computable_parameter}")
    sections.append(f"**Analysis plan.** {idea.analysis_plan}")
    sections.append(f"**Expected direction.** {idea.expected_direction or 'Not specified.'}")
    sections.append("**Modalities.** " + ", ".join(idea.modalities))

    if idea.cell_types:
        cell_type_text = ", ".join(idea.cell_types)
    else:
        cell_type_text = "Any compatible cell type"
    sections.append("**Cell types.** " + cell_type_text)

    quoted_fields = [f"`{name}`" for name in idea.required_fields]
    sections.append("**Required fields.** " + ", ".join(quoted_fields))

    quoted_checks = [f"`{name}`" for name in idea.validation_checks]
    sections.append("**Validation checks.** " + ", ".join(quoted_checks))

    brief = "\n\n".join(sections)
    return brief.strip()
def _idea_markdown_cell(idea: DiscoveryIdea) -> str:
    """Return the text of the idea cell, making sure it names the idea.

    The Markdown brief is used unchanged when the idea title already occurs
    (case-insensitively) within its first 300 characters; otherwise a
    standard title heading is prepended.
    """
    body = idea_to_markdown(idea)
    opening = body[:300].lower()
    if idea.idea_title.lower() not in opening:
        return f"# Auto-discovery idea: {idea.idea_title}\n\n{body}"
    return body
def _code_cell(source: str, *, cell_id: str | None = None) -> dict[str, Any]:
    """Build an unexecuted code cell dict from ``source``.

    The cell starts with no outputs and an ``execution_count`` of ``None``.
    The source is stored as a list of lines with their line endings kept. An
    ``id`` entry is only included when ``cell_id`` is truthy.
    """
    cell: dict[str, Any] = {"cell_type": "code"}
    if cell_id:
        cell["id"] = cell_id
    cell["execution_count"] = None
    cell["metadata"] = {}
    cell["outputs"] = []
    cell["source"] = source.splitlines(keepends=True)
    return cell
def _default_analysis_code() -> str:
    """Return the placeholder source for the free-form exploration cell.

    The placeholder only initialises ``result_table`` and ``figures``.
    """
    lines = [
        "# Free-form analysis area.",
        "# Suggested outputs: result_table, figures, and verification metrics.",
        "result_table = None",
        "figures = []",
    ]
    return "".join(f"{line}\n" for line in lines)
def _default_verification_code() -> str:
    """Return the default source for the scaffolded verification cell.

    The generated code builds a draft ``verification`` dict, with every
    validation check of ``IDEA`` marked ``'not_run'``, and prints it as JSON.
    It refers to ``IDEA``, ``review`` and ``json``, which it expects to
    already exist when the cell runs.
    """
    lines = [
        "verification = {",
        " 'idea_id': IDEA.idea_id,",
        " 'accepted_by_schema_review': None if review is None else review.accepted,",
        " 'checks': {check: 'not_run' for check in IDEA.validation_checks},",
        " 'status': 'draft',",
        " 'notes': [],",
        "}",
        "print(json.dumps(verification, indent=2))",
    ]
    return "\n".join(lines) + "\n"