Source code for uchrom.auto_discovery.ideas

"""Idea records and schema-based review for auto-discovery."""

from __future__ import annotations

import hashlib
import json
import re
from dataclasses import asdict, dataclass, field
from typing import Any, Mapping


@dataclass
class DiscoveryIdea:
    """One computable multi-omics discovery idea.

    Construction normalises the record: ``idea_markdown`` is turned into a
    string (any falsy value becomes ``""``), the four list fields are rebuilt
    as lists of ``str``, and ``complexity`` is converted with ``int()`` and
    must lie in 1..5. When ``idea_id`` is empty, an id is derived from all
    other fields via ``stable_idea_id``.

    Raises:
        ValueError: if ``complexity`` is outside 1..5 after conversion.
    """

    idea_title: str
    biological_hypothesis: str
    computable_parameter: str
    analysis_plan: str
    modalities: list[str]
    idea_markdown: str = ""
    cell_types: list[str] = field(default_factory=list)
    required_fields: list[str] = field(default_factory=list)
    validation_checks: list[str] = field(default_factory=list)
    expected_direction: str = ""
    complexity: int = 3
    idea_id: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Coerce field types, validate ``complexity`` and fill ``idea_id``."""
        markdown = self.idea_markdown
        self.idea_markdown = str(markdown) if markdown else ""

        # Rebuild every list-valued field as a fresh list of strings, in
        # declaration order.
        for list_attr in (
            "modalities",
            "cell_types",
            "required_fields",
            "validation_checks",
        ):
            setattr(self, list_attr, [str(entry) for entry in getattr(self, list_attr)])

        level = int(self.complexity)
        self.complexity = level
        if level < 1 or level > 5:
            raise ValueError("complexity must be between 1 and 5")

        if not self.idea_id:
            # The id is computed from the record *without* its id field.
            self.idea_id = stable_idea_id(self.to_dict(include_id=False))

    def to_dict(self, *, include_id: bool = True) -> dict[str, Any]:
        """Return the idea as a plain dict, optionally omitting ``idea_id``."""
        payload = asdict(self)
        if include_id:
            return payload
        payload.pop("idea_id", None)
        return payload

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> "DiscoveryIdea":
        """Build an idea from a mapping whose keys are the field names."""
        kwargs = dict(data)
        return cls(**kwargs)
@dataclass
class IdeaReview:
    """Outcome of checking one idea against a discovery schema."""

    # True when the review produced no errors.
    accepted: bool
    # Problems that cause the idea to be rejected.
    errors: list[str] = field(default_factory=list)
    # Non-blocking remarks.
    warnings: list[str] = field(default_factory=list)
    # Required field names that were not found in the schema.
    missing_fields: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the review as a plain dict keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
def stable_idea_id(idea: Mapping[str, Any]) -> str:
    """Derive a deterministic short id of the form ``<slug>-<digest>``.

    The slug is the lower-cased ``idea_title`` with every run of characters
    outside ``a-z0-9`` replaced by ``-``, stripped of leading/trailing dashes
    and cut to 48 characters; ``"idea"`` is used when that leaves nothing.
    The digest is the first 10 hex characters of the SHA-1 of the whole
    mapping serialised as key-sorted JSON (non-JSON values go through
    ``str``).
    """
    lowered_title = str(idea.get("idea_title", "idea")).lower()
    dashed = re.sub(r"[^a-z0-9]+", "-", lowered_title)
    # NOTE: truncation happens after stripping, so a slug cut exactly at a
    # separator can still end in "-". Left as is so existing ids stay stable.
    slug = dashed.strip("-")[:48]
    if not slug:
        slug = "idea"

    serialized = json.dumps(idea, sort_keys=True, default=str)
    fingerprint = hashlib.sha1(serialized.encode("utf-8")).hexdigest()
    return f"{slug}-{fingerprint[:10]}"
def review_idea_against_schema(
    idea: DiscoveryIdea | Mapping[str, Any],
    schema: Mapping[str, Any],
    *,
    min_complexity: int = 1,
    max_complexity: int = 5,
) -> IdeaReview:
    """Check whether an idea is compatible with a discovery schema.

    Args:
        idea: A ``DiscoveryIdea`` or a mapping accepted by
            ``DiscoveryIdea.from_dict``.
        schema: Schema mapping. The keys read here are ``"modalities"``
            (name -> mapping with a ``"present"`` flag) and
            ``"catalogs"["cell_types"]["values"]``; required fields are
            resolved through ``_field_exists``. A section that is missing
            *or* ``None`` is treated as empty.
        min_complexity: Lowest accepted ``complexity`` (inclusive).
        max_complexity: Highest accepted ``complexity`` (inclusive).

    Returns:
        An ``IdeaReview`` whose ``accepted`` flag is true only when no error
        was recorded.

    Raises:
        Whatever ``DiscoveryIdea.from_dict`` raises when ``idea`` is a
        mapping that cannot be turned into a ``DiscoveryIdea``.
    """
    if not isinstance(idea, DiscoveryIdea):
        idea = DiscoveryIdea.from_dict(idea)

    errors: list[str] = []
    warnings: list[str] = []

    # The four free-text fields must contain something besides whitespace.
    for text_attr in (
        "idea_title",
        "biological_hypothesis",
        "computable_parameter",
        "analysis_plan",
    ):
        if not getattr(idea, text_attr).strip():
            errors.append(f"{text_attr} is empty")

    if not (min_complexity <= idea.complexity <= max_complexity):
        errors.append(f"complexity {idea.complexity} outside allowed range")

    # ``.get(key, {})`` only defaults on a missing key; ``or {}`` also covers
    # a section that is present but None.
    modalities = schema.get("modalities") or {}
    for modality in idea.modalities:
        info = modalities.get(modality)
        if info is None:
            errors.append(f"unknown modality: {modality}")
        elif not info.get("present"):
            errors.append(f"modality is not present in this h5cd: {modality}")

    catalogs = schema.get("catalogs") or {}
    cell_type_catalog = catalogs.get("cell_types") or {}
    known_cell_types = set(cell_type_catalog.get("values") or [])
    # An empty catalog means "nothing to check against", not "nothing allowed".
    if known_cell_types:
        for cell_type in idea.cell_types:
            if cell_type not in known_cell_types:
                errors.append(f"unknown cell_type: {cell_type}")

    missing_fields: list[str] = [
        field_name
        for field_name in idea.required_fields
        if not _field_exists(field_name, schema)
    ]
    if missing_fields:
        errors.append("required field(s) not found: " + ", ".join(missing_fields))

    if not idea.validation_checks:
        warnings.append("idea has no explicit validation_checks")
    if "rna_expression" in idea.modalities and "linked_adata.X" not in idea.required_fields:
        errors.append("rna_expression modality requires linked_adata.X in required_fields")
    if len(idea.modalities) >= 2 and "cell_id_alignment" not in idea.validation_checks:
        warnings.append(
            "multi-modal idea should include a cell_id_alignment validation check"
        )

    return IdeaReview(
        accepted=not errors,
        errors=errors,
        warnings=warnings,
        missing_fields=missing_fields,
    )
def _field_exists(field_name: str, schema: Mapping[str, Any]) -> bool: if field_name == "coords": return bool(schema.get("fields", {}).get("coords")) fields = schema.get("fields", {}) catalogs = schema.get("catalogs", {}) if "." not in field_name: return field_name in fields prefix, suffix = field_name.split(".", 1) if prefix in {"spots", "tracks", "cells", "traces"}: return suffix in (fields.get(prefix, {}) or {}).get("columns", []) if prefix in {"cellm", "layers"}: return suffix in (fields.get(prefix, {}) or {}).get("keys", []) if prefix == "linked_adata": linked = schema.get("linked_adata", {}) if not linked.get("present"): return False if suffix in {"X", "obs", "var"}: return True if suffix.startswith("obs."): return suffix[4:] in linked.get("obs_columns", []) if suffix.startswith("var."): gene = suffix[4:] gene_catalog = catalogs.get("genes", {}) or {} return gene in gene_catalog.get("values", []) if suffix.startswith("layers."): return suffix[7:] in linked.get("layers", []) return False