Source code for uchrom.auto_discovery.notebooks

"""Notebook scaffolding for auto-discovery idea exploration."""

from __future__ import annotations

import base64
import json
import os
import sys
import traceback
from contextlib import redirect_stderr, redirect_stdout
from io import BytesIO
from io import StringIO
from pathlib import Path
from typing import Any, Mapping

from .evidence import structured_conclusion_markdown
from .ideas import DiscoveryIdea



[docs]
def create_exploration_notebook(
    idea: DiscoveryIdea | Mapping[str, Any],
    output_path: str | Path,
    *,
    h5cd_path: str | Path | None = None,
    run_output_dir: str | Path | None = None,
    analysis_code: str | None = None,
    verification_code: str | None = None,
    kernel_name: str = "python3",
) -> Path:
    """Write the standard exploration notebook scaffold for a single idea.

    The scaffold only fixes the audit trail and the verification contract;
    the code agent is expected to edit and execute the notebook freely.

    ``idea`` may be a ``DiscoveryIdea`` or a mapping handed to
    ``DiscoveryIdea.from_dict``.  Missing parent directories of
    ``output_path`` are created.  ``analysis_code`` and ``verification_code``
    replace the default exploration / verification cell sources when they
    are truthy.  ``kernel_name`` is stored as both the kernelspec name and
    display name.  The notebook path is returned as a ``Path``.
    """
    if not isinstance(idea, DiscoveryIdea):
        idea = DiscoveryIdea.from_dict(idea)

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Python expressions spliced verbatim into the generated setup cell.
    if h5cd_path is None:
        h5cd_literal = "None"
    else:
        h5cd_literal = repr(str(h5cd_path))
    if run_output_dir is None:
        run_output_literal = "Path.cwd()"
    else:
        run_output_literal = f"Path({str(run_output_dir)!r})"

    cells: list[dict[str, Any]] = []

    # 1. Human-readable brief of the idea.
    cells.append(_markdown_cell(_idea_markdown_cell(idea), cell_id="idea_metadata"))

    # 2. Setup: imports, the embedded idea, and the optional dataset handle.
    setup_source = "".join([
        "from pathlib import Path\n",
        "import json\n",
        "import os\n",
        "os.environ.setdefault('MPLBACKEND', 'Agg')\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib\n",
        "matplotlib.use('Agg', force=True)\n",
        "import matplotlib.pyplot as plt\n",
        "from uchrom import ChromData\n",
        "from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema\n\n",
        f"IDEA = DiscoveryIdea.from_dict({idea.to_dict()!r})\n",
        f"H5CD_PATH = {h5cd_literal}\n",
        f"RUN_OUTPUT_DIR = {run_output_literal}\n",
        "RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
        "cdata = ChromData.read(H5CD_PATH) if H5CD_PATH else None\n",
        "schema = cdata.discovery_schema if cdata is not None else None\n",
        "adata = cdata.linked_adata if cdata is not None else None\n",
        "print(IDEA.idea_id)\n",
        "if cdata is not None:\n",
        "    print(cdata)\n",
        "    print(cdata.describe_for_agent(max_items=20))\n",
    ])
    cells.append(_code_cell(setup_source, cell_id="setup"))

    # 3. Schema review of the idea (skipped inside the cell when no schema).
    cells.append(_markdown_cell("## Required data checks", cell_id="required_data_checks"))
    review_source = "".join([
        "review = review_idea_against_schema(IDEA, schema) if schema is not None else None\n",
        "print(None if review is None else review.to_dict())\n",
        "assert review is None or review.accepted, review.to_dict()\n",
    ])
    cells.append(_code_cell(review_source, cell_id="schema_review"))

    # 4. Free-form exploration area.
    cells.append(
        _markdown_cell(
            "## Exploration\n\nThe code agent can freely add cells below this point.",
            cell_id="exploration",
        )
    )
    exploration_source = analysis_code or _default_analysis_code()
    cells.append(_code_cell(exploration_source, cell_id="exploration_code"))

    # 5. Verification section.
    summary_text = (
        "## Runner verification summary\n\n"
        "This scaffolded section is generated by U-Chrom. The notebook agent "
        "executes it after exploration, and the runner re-executes it during "
        "final verification."
    )
    cells.append(_markdown_cell(summary_text, cell_id="verification_summary"))
    verification_source = verification_code or _default_verification_code()
    cells.append(_code_cell(verification_source, cell_id="verification"))

    kernelspec = {
        "display_name": kernel_name,
        "language": "python",
        "name": kernel_name,
    }
    provenance = {
        "idea_id": idea.idea_id,
        "idea_title": idea.idea_title,
        "h5cd_path": None if h5cd_path is None else str(h5cd_path),
    }
    notebook = {
        "cells": cells,
        "metadata": {
            "kernelspec": kernelspec,
            "language_info": {"name": "python"},
            "uchrom_auto_discovery": provenance,
        },
        "nbformat": 4,
        "nbformat_minor": 5,
    }
    output_path.write_text(json.dumps(notebook, indent=2))
    return output_path




[docs]
def execute_notebook_python(
    notebook_path: str | Path,
    *,
    stop_on_error: bool = False,
) -> dict[str, Any]:
    """Execute Python code cells in a notebook JSON file.

    This lightweight executor is intentionally small: it keeps a shared
    Python namespace across code cells, captures stdout/stderr/error text,
    writes outputs back into the notebook, and returns the final namespace
    entries commonly used by the auto-discovery runner.  It is meant for
    deterministic smoke tests; Pantheon/Jupyter can still execute the same
    notebooks in richer interactive runs.

    Args:
        notebook_path: Path of the notebook JSON file.  The file is rewritten
            in place with execution counts and captured outputs.
        stop_on_error: When true, stop after the first code cell that raises.
            The failing cell still records its captured stdout, stderr and
            any new matplotlib figures alongside the error output.

    Returns:
        A dict with ``ok`` (false if any cell raised), ``errors`` (one dict
        per failure with ``type``, ``message`` and ``traceback``),
        ``verification`` and ``result_table`` (the namespace values of those
        names, or ``None``), and ``namespace_keys`` (sorted names that do not
        start with a double underscore).

    Note:
        Cell sources run through ``exec`` inside the current process, so only
        notebooks from a trusted source should be executed this way.
    """
    notebook_path = Path(notebook_path)
    # Notebook files are UTF-8 JSON; read them as such instead of relying on
    # the locale encoding, which breaks on non-ASCII content on some systems.
    nb = json.loads(notebook_path.read_text(encoding="utf-8"))
    _configure_matplotlib_backend()
    namespace: dict[str, Any] = {
        "__name__": "__uchrom_auto_discovery_notebook__",
        "__file__": str(notebook_path),
        "display": _display,
    }
    ok = True
    errors: list[dict[str, str]] = []
    execution_count = 0
    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "code":
            continue
        execution_count += 1
        cell["execution_count"] = execution_count
        # ``source`` is normally a list of lines; joining also leaves a plain
        # string source unchanged.
        source = "".join(cell.get("source", []))
        before_figures = _current_matplotlib_figures()
        stdout = StringIO()
        stderr = StringIO()
        outputs: list[dict[str, Any]] = []
        failed = False
        try:
            with redirect_stdout(stdout), redirect_stderr(stderr):
                # compile() is inside the try so syntax errors are reported
                # as cell errors instead of aborting the whole run.
                exec(compile(source, str(notebook_path), "exec"), namespace)
        except Exception as exc:  # pragma: no cover - exercised by runner failures
            failed = True
            ok = False
            tb = traceback.format_exc()
            errors.append({"type": type(exc).__name__, "message": str(exc), "traceback": tb})
            outputs.append({
                "output_type": "error",
                "ename": type(exc).__name__,
                "evalue": str(exc),
                "traceback": tb.splitlines(),
            })
        # Streams and figures are recorded for failing cells too, so that
        # diagnostics printed before the exception are not lost and figures
        # opened by the cell do not stay open.
        stdout_text = stdout.getvalue()
        if stdout_text:
            outputs.append({
                "output_type": "stream",
                "name": "stdout",
                "text": stdout_text.splitlines(keepends=True),
            })
        stderr_text = stderr.getvalue()
        if stderr_text:
            outputs.append({
                "output_type": "stream",
                "name": "stderr",
                "text": stderr_text.splitlines(keepends=True),
            })
        outputs.extend(_capture_new_matplotlib_figures(before_figures))
        cell["outputs"] = outputs
        if failed and stop_on_error:
            break
    # default=str keeps the write from failing on values json cannot encode.
    notebook_path.write_text(json.dumps(nb, indent=2, default=str), encoding="utf-8")
    return {
        "ok": ok,
        "errors": errors,
        "verification": namespace.get("verification"),
        "result_table": namespace.get("result_table"),
        "namespace_keys": sorted(k for k in namespace if not k.startswith("__")),
    }




[docs]
def upsert_structured_conclusion(
    notebook_path: str | Path,
    idea: DiscoveryIdea | Mapping[str, Any],
    verification: Mapping[str, Any] | None,
) -> Path:
    """Insert or replace the notebook's final interpretation with standard text.

    The cell to replace is looked up in this order:

    1. the cell whose ``id`` is ``"structured_conclusion"``;
    2. the first markdown cell whose text starts (case-insensitively) with
       ``# final interpretation`` or ``## final interpretation``.

    When neither exists, the new cell is inserted right after the cell whose
    ``id`` is ``"verification"``, or appended at the end if there is none.

    Args:
        notebook_path: Notebook JSON file, rewritten in place.
        idea: Idea passed through to ``structured_conclusion_markdown``.
        verification: Verification mapping passed through to
            ``structured_conclusion_markdown``; may be ``None``.

    Returns:
        ``notebook_path`` as a ``Path``.
    """
    notebook_path = Path(notebook_path)
    # Notebook files are UTF-8 JSON; do not depend on the locale encoding.
    nb = json.loads(notebook_path.read_text(encoding="utf-8"))
    source = structured_conclusion_markdown(idea, verification).splitlines(keepends=True)
    # setdefault (not get) so that a notebook without a "cells" key still
    # receives the new cell instead of it landing in a detached list.
    cells = nb.setdefault("cells", [])

    target_index = next(
        (idx for idx, cell in enumerate(cells) if cell.get("id") == "structured_conclusion"),
        None,
    )
    if target_index is None:
        headings = ("## final interpretation", "# final interpretation")
        for idx, cell in enumerate(cells):
            if cell.get("cell_type") != "markdown":
                continue
            text = "".join(cell.get("source", [])).strip().lower()
            if text.startswith(headings):
                target_index = idx
                break

    conclusion_cell = {
        "cell_type": "markdown",
        "id": "structured_conclusion",
        "metadata": {"generated_by": "uchrom_auto_discovery"},
        "source": source,
    }
    if target_index is not None:
        cells[target_index] = conclusion_cell
    else:
        # Default to the end; prefer the slot right after the verification cell.
        insert_at = len(cells)
        for idx, existing in enumerate(cells):
            if existing.get("id") == "verification":
                insert_at = idx + 1
                break
        cells.insert(insert_at, conclusion_cell)
    notebook_path.write_text(json.dumps(nb, indent=2, default=str), encoding="utf-8")
    return notebook_path



def _configure_matplotlib_backend() -> None:
    """Use a non-interactive matplotlib backend for batch notebook execution."""
    os.environ.setdefault("MPLBACKEND", "Agg")
    try:
        import matplotlib

        matplotlib.use("Agg", force=True)
    except Exception:
        return


def _display(*objects: Any, **_: Any) -> None:
    """Small IPython ``display`` stand-in for deterministic smoke execution."""
    for obj in objects:
        print(repr(obj))


def _current_matplotlib_figures() -> set[int]:
    plt = sys.modules.get("matplotlib.pyplot")
    if plt is None or not hasattr(plt, "get_fignums"):
        return set()
    try:
        return set(plt.get_fignums())
    except Exception:
        return set()


def _capture_new_matplotlib_figures(before_figures: set[int]) -> list[dict[str, Any]]:
    plt = sys.modules.get("matplotlib.pyplot")
    if plt is None or not hasattr(plt, "get_fignums"):
        return []
    outputs: list[dict[str, Any]] = []
    try:
        new_figures = [num for num in plt.get_fignums() if num not in before_figures]
        for num in new_figures:
            fig = plt.figure(num)
            buf = BytesIO()
            fig.savefig(buf, format="png", bbox_inches="tight", dpi=140)
            data = base64.b64encode(buf.getvalue()).decode("ascii")
            outputs.append({
                "output_type": "display_data",
                "data": {"image/png": data, "text/plain": ["<Figure size>"]},
                "metadata": {},
            })
            plt.close(fig)
    except Exception:
        return outputs
    return outputs


def _markdown_cell(source: str, *, cell_id: str | None = None) -> dict[str, Any]:
    return {
        "cell_type": "markdown",
        **({"id": cell_id} if cell_id else {}),
        "metadata": {},
        "source": source.splitlines(keepends=True),
    }



[docs]
def idea_to_markdown(idea: DiscoveryIdea | Mapping[str, Any]) -> str:
    """Return a readable Markdown brief for a discovery idea.

    A mapping is first converted with ``DiscoveryIdea.from_dict``.  If the
    idea carries non-blank ``idea_markdown``, that text is returned stripped;
    otherwise a brief is assembled from the idea's individual fields, one
    paragraph per field.
    """
    if not isinstance(idea, DiscoveryIdea):
        idea = DiscoveryIdea.from_dict(idea)

    authored = idea.idea_markdown.strip()
    if authored:
        return authored

    paragraphs: list[str] = []
    paragraphs.append(f"# Auto-discovery idea: {idea.idea_title}")
    paragraphs.append(f"**Hypothesis.** {idea.biological_hypothesis}")
    paragraphs.append(f"**Computable parameter.** {idea.computable_parameter}")
    paragraphs.append(f"**Analysis plan.** {idea.analysis_plan}")
    paragraphs.append(f"**Expected direction.** {idea.expected_direction or 'Not specified.'}")
    paragraphs.append("**Modalities.** " + ", ".join(idea.modalities))
    if idea.cell_types:
        cell_type_text = ", ".join(idea.cell_types)
    else:
        cell_type_text = "Any compatible cell type"
    paragraphs.append("**Cell types.** " + cell_type_text)
    paragraphs.append(
        "**Required fields.** " + ", ".join(f"`{name}`" for name in idea.required_fields)
    )
    paragraphs.append(
        "**Validation checks.** " + ", ".join(f"`{name}`" for name in idea.validation_checks)
    )
    return "\n\n".join(paragraphs).strip()



def _idea_markdown_cell(idea: DiscoveryIdea) -> str:
    """Return the idea brief, prefixed with a title heading when needed.

    The heading is added only when the idea title does not already occur
    (case-insensitively) within the first 300 characters of the brief.
    """
    body = idea_to_markdown(idea)
    title_already_shown = idea.idea_title.lower() in body[:300].lower()
    if not title_already_shown:
        return f"# Auto-discovery idea: {idea.idea_title}\n\n{body}"
    return body


def _code_cell(source: str, *, cell_id: str | None = None) -> dict[str, Any]:
    return {
        "cell_type": "code",
        **({"id": cell_id} if cell_id else {}),
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source.splitlines(keepends=True),
    }


def _default_analysis_code() -> str:
    return (
        "# Free-form analysis area.\n"
        "# Suggested outputs: result_table, figures, and verification metrics.\n"
        "result_table = None\n"
        "figures = []\n"
    )


def _default_verification_code() -> str:
    return (
        "verification = {\n"
        "    'idea_id': IDEA.idea_id,\n"
        "    'accepted_by_schema_review': None if review is None else review.accepted,\n"
        "    'checks': {check: 'not_run' for check in IDEA.validation_checks},\n"
        "    'status': 'draft',\n"
        "    'notes': [],\n"
        "}\n"
        "print(json.dumps(verification, indent=2))\n"
    )