Skip to content

Commit 04b22f8

Browse files
architecture: recover inferred architecture from persisted docs
1 parent 1043a10 commit 04b22f8

20 files changed

+4111
-114
lines changed

packages/core/contextmine_core/architecture/agent_sdk.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from uuid import UUID
1313

1414
from .arc42 import SECTION_TITLES
15+
from .recovery_model import RecoveredArchitectureModel
1516
from .schemas import Arc42Document
1617

1718

@@ -185,13 +186,40 @@ def _render_markdown(title: str, sections: dict[str, str]) -> str:
185186
return "\n".join(lines).strip() + "\n"
186187

187188

188-
def _arc42_prompt(*, scenario_name: str, section: str | None) -> str:
189+
def _recovered_architecture_payload(
190+
recovered_architecture: RecoveredArchitectureModel | dict[str, Any] | None,
191+
) -> dict[str, Any] | None:
192+
if recovered_architecture is None:
193+
return None
194+
if isinstance(recovered_architecture, RecoveredArchitectureModel):
195+
return recovered_architecture.canonical_payload()
196+
if isinstance(recovered_architecture, dict):
197+
return recovered_architecture
198+
raise TypeError("recovered_architecture must be a RecoveredArchitectureModel, dict, or None")
199+
200+
201+
def _arc42_prompt(
202+
*,
203+
scenario_name: str,
204+
section: str | None,
205+
recovered_architecture: RecoveredArchitectureModel | dict[str, Any] | None = None,
206+
) -> str:
189207
section_instruction = (
190208
f"Focus section: {section}. Still return all 12 section keys."
191209
if section
192210
else "No section filter. Return all 12 sections."
193211
)
194212
section_keys = ", ".join(SECTION_TITLES.keys())
213+
recovered_payload = _recovered_architecture_payload(recovered_architecture)
214+
recovered_instruction = ""
215+
if recovered_payload is not None:
216+
recovered_instruction = (
217+
"\n\nRecovered architecture payload is provided below. "
218+
"Reason explicitly over recovered entities, relationships, hypotheses, and decisions. "
219+
"Do not hide ambiguity: if recovered hypotheses are ambiguous or unresolved, carry that uncertainty into the output.\n"
220+
"Treat the payload as evidence-backed architecture context; do not invent facts beyond it or the repository evidence.\n"
221+
f"Recovered architecture JSON:\n{json.dumps(recovered_payload, sort_keys=True)}"
222+
)
195223
return (
196224
"Generate a real arc42 document from repository evidence using tools. "
197225
"Do not invent facts. If evidence is missing, write exactly "
@@ -208,6 +236,7 @@ def _arc42_prompt(*, scenario_name: str, section: str | None) -> str:
208236
"}\n\n"
209237
f"Mandatory section keys: {section_keys}\n"
210238
"No Markdown fences. JSON only."
239+
f"{recovered_instruction}"
211240
)
212241

213242

@@ -218,6 +247,7 @@ async def generate_arc42_with_claude_sdk(
218247
scenario_name: str,
219248
repo_path: Path,
220249
section: str | None = None,
250+
recovered_architecture: RecoveredArchitectureModel | dict[str, Any] | None = None,
221251
model: str = "claude-sonnet-4-5-20250929",
222252
max_turns: int = 50,
223253
permission_mode: str = "bypassPermissions",
@@ -228,7 +258,11 @@ async def generate_arc42_with_claude_sdk(
228258
raise FileNotFoundError(f"Repository path does not exist: {repo_path}")
229259

230260
scope_key = f"arc42:{collection_id}:{scenario_id}"
231-
prompt = _arc42_prompt(scenario_name=scenario_name, section=section)
261+
prompt = _arc42_prompt(
262+
scenario_name=scenario_name,
263+
section=section,
264+
recovered_architecture=recovered_architecture,
265+
)
232266
raw_output, runtime_meta = await _SESSION_MANAGER.run_prompt(
233267
repo_path=repo_path,
234268
scope_key=scope_key,

packages/core/contextmine_core/architecture/facts.py

Lines changed: 220 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,13 @@
1818
MetricSnapshot,
1919
TwinScenario,
2020
)
21-
from contextmine_core.twin import (
22-
GraphProjection,
23-
get_full_scenario_graph,
24-
get_scenario_provenance_node_ids,
25-
)
21+
from contextmine_core.twin import get_full_scenario_graph, get_scenario_provenance_node_ids
2622
from contextmine_core.twin.grouping import canonical_file_path_from_meta, derive_arch_group
2723
from sqlalchemy import select
2824
from sqlalchemy.ext.asyncio import AsyncSession
2925

26+
from .recovery import recover_architecture_model
27+
from .recovery_docs import load_recovery_docs
3028
from .schemas import ArchitectureFact, ArchitectureFactsBundle, EvidenceRef, PortAdapterFact
3129

3230
DETERMINISTIC_CONFIDENCE = 0.9
@@ -62,6 +60,7 @@
6260

6361
_derive_arch_group = derive_arch_group
6462
_canonical_file_path = canonical_file_path_from_meta
63+
_compat_get_full_scenario_graph = get_full_scenario_graph
6564

6665

6766
def _evidence_from_symbol_meta(node: KnowledgeNode) -> tuple[EvidenceRef, ...]:
@@ -248,6 +247,209 @@ def _dedupe_ports(facts: list[PortAdapterFact]) -> list[PortAdapterFact]:
248247
return sorted(by_id.values(), key=lambda row: row.fact_id)
249248

250249

250+
def _kg_nodes_to_recovery_inputs(
251+
nodes: list[KnowledgeNode],
252+
) -> tuple[list[dict[str, Any]], dict[UUID, str]]:
253+
subject_ref_by_id: dict[UUID, str] = {}
254+
recovery_nodes: list[dict[str, Any]] = []
255+
for node in nodes:
256+
subject_ref = str(node.natural_key or node.id)
257+
subject_ref_by_id[node.id] = subject_ref
258+
recovery_nodes.append(
259+
{
260+
"id": subject_ref,
261+
"kind": node.kind,
262+
"name": node.name,
263+
"natural_key": node.natural_key,
264+
"meta": node.meta or {},
265+
}
266+
)
267+
return recovery_nodes, subject_ref_by_id
268+
269+
270+
def _kg_edges_to_recovery_inputs(
271+
edges: list[KnowledgeEdge],
272+
subject_ref_by_id: dict[UUID, str],
273+
) -> list[dict[str, Any]]:
274+
recovery_edges: list[dict[str, Any]] = []
275+
for edge in edges:
276+
source_ref = subject_ref_by_id.get(edge.source_node_id)
277+
target_ref = subject_ref_by_id.get(edge.target_node_id)
278+
if not source_ref or not target_ref:
279+
continue
280+
recovery_edges.append(
281+
{
282+
"source_node_id": source_ref,
283+
"target_node_id": target_ref,
284+
"kind": edge.kind,
285+
"meta": {},
286+
}
287+
)
288+
return recovery_edges
289+
290+
291+
def _recovered_entity_to_fact(entity: Any) -> ArchitectureFact:
    """Wrap a recovered architecture entity in an ``ArchitectureFact``.

    The entity's ``kind`` becomes the fact type; a human-readable form of it
    (underscores replaced by spaces) is used in the title and description.

    Args:
        entity: Object exposing ``entity_id``, ``kind``, ``name``,
            ``confidence``, ``attributes`` and ``evidence``.

    Returns:
        A fact with source ``"deterministic"`` and id
        ``recovered_entity:<entity_id>``.
    """
    kind = entity.kind
    readable_kind = kind.replace("_", " ")

    # entity.attributes is merged last, so a clashing "entity_id" key inside
    # it takes precedence over the entity's own id.
    attributes = {"entity_id": entity.entity_id}
    attributes.update(entity.attributes)

    return ArchitectureFact(
        fact_id=f"recovered_entity:{entity.entity_id}",
        fact_type=kind,
        title=f"{readable_kind.title()} {entity.name}",
        description=f"Recovered {readable_kind} '{entity.name}'",
        source="deterministic",
        confidence=float(entity.confidence),
        tags=("recovered", kind),
        attributes=attributes,
        evidence=entity.evidence,
    )
303+
304+
305+
def _recovered_relationship_to_fact(relationship: Any) -> ArchitectureFact:
    """Wrap a recovered relationship between two entities in an ``ArchitectureFact``.

    Args:
        relationship: Object exposing ``source_entity_id``,
            ``target_entity_id``, ``kind``, ``confidence``, ``attributes`` and
            ``evidence``.

    Returns:
        A ``"recovered_relationship"`` fact with source ``"deterministic"``
        whose id and description are built from source id, kind and target id.
    """
    source_id = relationship.source_entity_id
    target_id = relationship.target_entity_id
    kind = relationship.kind

    # relationship.attributes is merged last and therefore wins on key clashes.
    attributes = {
        "source_entity_id": source_id,
        "target_entity_id": target_id,
        "relationship_kind": kind,
    }
    attributes.update(relationship.attributes)

    return ArchitectureFact(
        fact_id=f"recovered_relationship:{source_id}:{kind}:{target_id}",
        fact_type="recovered_relationship",
        title="Recovered relationship",
        description=f"{source_id} {kind} {target_id}",
        source="deterministic",
        confidence=float(relationship.confidence),
        tags=("recovered", "relationship", kind),
        attributes=attributes,
        evidence=relationship.evidence,
    )
327+
328+
329+
def _recovered_hypothesis_to_fact(hypothesis: Any) -> ArchitectureFact:
    """Wrap a recovery hypothesis in an ``ArchitectureFact``.

    Args:
        hypothesis: Object exposing ``subject_ref``, ``rationale``,
            ``confidence``, ``status``, ``candidate_entity_ids``,
            ``selected_entity_ids`` and ``evidence``.

    Returns:
        A ``"recovered_hypothesis"`` fact with source ``"hybrid"``; the
        hypothesis status is carried both as a tag and as an attribute, and
        the candidate/selected id collections are copied into lists.
    """
    subject = hypothesis.subject_ref
    status = hypothesis.status
    attributes = {
        "subject_ref": subject,
        "candidate_entity_ids": list(hypothesis.candidate_entity_ids),
        "selected_entity_ids": list(hypothesis.selected_entity_ids),
        "status": status,
    }
    return ArchitectureFact(
        fact_id=f"recovered_hypothesis:{subject}",
        fact_type="recovered_hypothesis",
        title=f"Recovered hypothesis for {subject}",
        description=hypothesis.rationale,
        source="hybrid",
        confidence=float(hypothesis.confidence),
        tags=("recovered", "hypothesis", status),
        attributes=attributes,
        evidence=hypothesis.evidence,
    )
346+
347+
348+
def _collect_recovered_architecture_facts(bundle: ArchitectureFactsBundle, model: Any) -> None:
    """Append facts for everything in a recovered architecture model to ``bundle``.

    Facts are added to ``bundle.facts`` in a fixed order: entities,
    relationships, hypotheses, then decisions. ``model.decisions`` is
    optional; a model without that attribute contributes no decision facts.

    Args:
        bundle: Facts bundle mutated in place.
        model: Recovered model exposing ``entities``, ``relationships``,
            ``hypotheses`` and optionally ``decisions``.
    """
    facts = bundle.facts
    facts.extend(_recovered_entity_to_fact(entity) for entity in model.entities)
    facts.extend(
        _recovered_relationship_to_fact(relationship) for relationship in model.relationships
    )
    facts.extend(_recovered_hypothesis_to_fact(hypothesis) for hypothesis in model.hypotheses)

    for decision in getattr(model, "decisions", ()):
        status = decision.status
        decision_fact = ArchitectureFact(
            fact_id=f"recovered_decision:{decision.title}",
            fact_type="architecture_decision",
            title=decision.title,
            description=decision.summary,
            source="deterministic",
            confidence=float(decision.confidence),
            tags=("recovered", "decision", status),
            attributes={
                "affected_entity_ids": list(decision.affected_entity_ids),
                "status": status,
            },
            evidence=decision.evidence,
        )
        facts.append(decision_fact)
372+
373+
374+
def _entity_kind(entity_id: str) -> str:
375+
return entity_id.split(":", 1)[0]
376+
377+
378+
def _best_membership(memberships: list[Any], kind: str) -> Any | None:
    """Pick the strongest membership whose entity id has the given kind prefix.

    Args:
        memberships: Objects exposing ``entity_id`` and ``confidence``.
        kind: Entity-id prefix (text before the first ``":"``) to match.

    Returns:
        The matching membership with the highest confidence; ties go to the
        smallest ``entity_id``. ``None`` when nothing matches.
    """
    of_kind = (
        membership for membership in memberships if _entity_kind(membership.entity_id) == kind
    )
    # Negated confidence makes "highest confidence" the minimum of the key.
    return min(
        of_kind,
        key=lambda membership: (-float(membership.confidence), membership.entity_id),
        default=None,
    )
385+
386+
387+
def _enrich_ports_with_recovery(ports: list[PortAdapterFact], model: Any) -> list[PortAdapterFact]:
    """Overlay recovered container/component membership onto port/adapter facts.

    For every port, memberships are collected from ``model.memberships_for``
    using the port's ``natural_key`` and ``source_natural_key`` attributes
    (blank values are skipped). Ports without any membership are passed
    through untouched. Otherwise a copy of the port is produced where:

    * ``container`` is the text after the first ``":"`` of the best
      ``container`` membership's entity id,
    * ``component`` is the recovered entity name of the best ``component``
      membership (falling back to the port's current component when the
      entity is not in ``model.entities``),
    * ``confidence`` is raised to the strongest membership confidence and is
      never lowered,
    * ``attributes`` gains ``candidate_memberships``: the sorted, de-duplicated
      entity ids of all collected memberships.

    Args:
        ports: Port/adapter facts to enrich.
        model: Recovered model exposing ``entities`` and ``memberships_for``.

    Returns:
        A new list with one (possibly replaced) fact per input port, in order.
    """
    entity_name_by_id = {entity.entity_id: entity.name for entity in model.entities}
    enriched: list[PortAdapterFact] = []
    for port in ports:
        subject_refs = [
            str(port.attributes.get("natural_key") or "").strip(),
            str(port.attributes.get("source_natural_key") or "").strip(),
        ]
        memberships: list[Any] = []
        for subject_ref in subject_refs:
            if not subject_ref:
                continue
            memberships.extend(model.memberships_for(subject_ref))

        if not memberships:
            enriched.append(port)
            continue

        candidate_memberships = sorted({membership.entity_id for membership in memberships})
        best_container = _best_membership(memberships, "container")
        best_component = _best_membership(memberships, "component")

        container = port.container
        if best_container is not None:
            # An entity id that is exactly "container" (no ":" separator) is
            # still selected by kind; indexing the split result would raise
            # IndexError, so keep the port's current container in that case.
            _, separator, container_name = best_container.entity_id.partition(":")
            if separator:
                container = container_name

        component = port.component
        if best_component is not None:
            component = entity_name_by_id.get(best_component.entity_id, component)

        enriched.append(
            replace(
                port,
                container=container,
                component=component,
                confidence=max(
                    float(port.confidence),
                    max(float(membership.confidence) for membership in memberships),
                ),
                attributes={
                    **port.attributes,
                    "candidate_memberships": candidate_memberships,
                },
            )
        )
    return enriched
433+
434+
435+
def _append_recovery_warning_counts(bundle: ArchitectureFactsBundle, model: Any) -> None:
436+
unresolved_count = sum(
437+
1 for hypothesis in model.hypotheses if hypothesis.status == "unresolved"
438+
)
439+
rejected_count = sum(
440+
1 for warning in model.warnings if "rejected adjudication" in warning.lower()
441+
)
442+
missing_packet_count = sum(
443+
1 for warning in model.warnings if "missing evidence packet" in warning.lower()
444+
)
445+
if unresolved_count:
446+
bundle.warnings.append(f"unresolved_hypotheses={unresolved_count}")
447+
if rejected_count:
448+
bundle.warnings.append(f"rejected_llm_adjudications={rejected_count}")
449+
if missing_packet_count:
450+
bundle.warnings.append(f"missing_evidence_packets={missing_packet_count}")
451+
452+
251453
def _collect_container_facts(bundle: ArchitectureFactsBundle, container_graph: dict) -> None:
252454
"""Add container facts from the architecture graph."""
253455
for node in container_graph["nodes"]:
@@ -386,26 +588,6 @@ async def build_architecture_facts(
386588
"ARCH_DOCS_LLM_ENRICH is enabled but no LLM provider is available; using deterministic fallback."
387589
)
388590

389-
container_graph = await get_full_scenario_graph(
390-
session=session,
391-
scenario_id=scenario_id,
392-
layer=None,
393-
projection=GraphProjection.ARCHITECTURE,
394-
entity_level="container",
395-
include_kinds={"file"},
396-
)
397-
component_graph = await get_full_scenario_graph(
398-
session=session,
399-
scenario_id=scenario_id,
400-
layer=None,
401-
projection=GraphProjection.ARCHITECTURE,
402-
entity_level="component",
403-
include_kinds={"file"},
404-
)
405-
406-
_collect_container_facts(bundle, container_graph)
407-
_collect_component_facts(bundle, component_graph)
408-
409591
await _collect_c4_view_facts(session, scenario_id, bundle)
410592

411593
metrics = (
@@ -506,6 +688,18 @@ async def build_architecture_facts(
506688
.scalars()
507689
.all()
508690
)
691+
recovery_nodes, recovery_subject_refs = _kg_nodes_to_recovery_inputs(kg_nodes)
692+
recovery_edges = _kg_edges_to_recovery_inputs(kg_edges, recovery_subject_refs)
693+
recovery_docs = await load_recovery_docs(session, kg_nodes)
694+
recovered_model = recover_architecture_model(
695+
recovery_nodes,
696+
recovery_edges,
697+
docs=recovery_docs,
698+
llm_adjudicator=llm_provider if enable_llm_enrich else None,
699+
)
700+
_collect_recovered_architecture_facts(bundle, recovered_model)
701+
_append_recovery_warning_counts(bundle, recovered_model)
702+
509703
outbound_edges = [
510704
edge
511705
for edge in kg_edges
@@ -531,6 +725,7 @@ async def build_architecture_facts(
531725
)
532726
ports = enriched
533727

728+
ports = _enrich_ports_with_recovery(ports, recovered_model)
534729
bundle.ports_adapters = _dedupe_ports(ports)
535730

536731
return bundle

0 commit comments

Comments
 (0)