Skip to content

Commit 99b5601

Browse files
dhellmann and claude committed
feat(graph): add suggest-base subcommand with collection impact analysis
Analyzes multiple collection graph files to find packages that appear across >= N collections. These shared packages are candidates for a "base" collection built once and reused across parallel collection builds. Also computes collection impact analysis showing how the proposed base would affect each input collection: remaining package counts per collection and cross-collection counts for remaining packages, helping identify candidates for a secondary base. New public command: fromager graph suggest-base GRAPH [GRAPH ...] Options: --min-collections INT threshold (default: 50% of input graphs, rounded up) --base PATH mark packages already in an existing base graph --format table|json output format (default: table) Implementation: - _get_collection_packages(): load graph, return canonical names - _find_shared_packages(): find overlap, sort by count desc then name asc - _compute_collection_impact(): per-collection remaining package analysis - _suggest_base_table(): rich MARKDOWN table output with impact sections - _suggest_base_json(): structured JSON with metadata, candidates, and collection_impact key - _suggest_base_impl(): testable core extracted from the click command Tests: 13 new unit tests covering helpers, table/JSON output, --base flag, impact analysis, and error cases. Closes: #973 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: Doug Hellmann <dhellmann@redhat.com>
1 parent 06e6317 commit 99b5601

2 files changed

Lines changed: 689 additions & 2 deletions

File tree

src/fromager/commands/graph.py

Lines changed: 336 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22
import itertools
33
import json
44
import logging
5+
import math
56
import pathlib
67
import sys
78
import typing
89

910
import click
11+
import rich
12+
import rich.box
1013
from packaging.requirements import Requirement
11-
from packaging.utils import canonicalize_name
14+
from packaging.utils import NormalizedName, canonicalize_name
1215
from packaging.version import Version
16+
from rich.table import Table
1317

1418
from fromager import clickext, context
1519
from fromager.commands import bootstrap
@@ -784,3 +788,334 @@ def n2s(nodes: typing.Iterable[DependencyNode]) -> str:
784788
topo.done(*nodes_to_build)
785789

786790
print(f"\nBuilding {len(graph)} packages in {rounds} rounds.")
791+
792+
793+
def _get_collection_name(graph_path: str) -> str:
794+
"""Derive collection name from file path stem."""
795+
return pathlib.Path(graph_path).stem
796+
797+
798+
def _get_collection_packages(graph_path: str) -> set[NormalizedName]:
    """Load the graph at *graph_path* and return its canonical package names.

    The synthetic ROOT node is excluded from the result.
    """
    dependency_graph = DependencyGraph.from_file(graph_path)
    names: set[NormalizedName] = set()
    for node in dependency_graph.get_all_nodes():
        name = node.canonicalized_name
        if name != ROOT:
            names.add(name)
    return names
806+
807+
808+
def _find_shared_packages(
    collections: dict[str, set[NormalizedName]],
    min_collections: int,
) -> list[dict[str, typing.Any]]:
    """Find packages in >= min_collections collections, sorted by count desc then name asc.

    Args:
        collections: mapping of collection name to the set of canonical
            package names that collection contains.
        min_collections: minimum number of collections a package must
            appear in to be reported.

    Returns:
        A list of ``{"package", "collections", "count"}`` dicts, sorted by
        count descending, then package name ascending. The "collections"
        list in each entry is sorted alphabetically.
    """
    # Build package -> containing-collection names in a single pass over the
    # input, instead of testing every package of the union against every
    # collection set (which was O(packages x collections)).
    membership: dict[NormalizedName, list[str]] = {}
    for coll_name, pkgs in collections.items():
        for pkg in pkgs:
            membership.setdefault(pkg, []).append(coll_name)

    results: list[dict[str, typing.Any]] = [
        {
            "package": pkg,
            "collections": sorted(containing),
            "count": len(containing),
        }
        for pkg, containing in membership.items()
        if len(containing) >= min_collections
    ]
    results.sort(key=lambda x: (-x["count"], x["package"]))
    return results
827+
828+
829+
def _compute_collection_impact(
    collections: dict[str, set[NormalizedName]],
    base_package_names: set[NormalizedName],
) -> list[dict[str, typing.Any]]:
    """For each collection, compute how many packages remain after removing base packages.

    Each entry includes per-remaining-package cross-collection counts.
    Sorted by remaining package count descending, then collection name ascending.

    Args:
        collections: mapping of collection name to its canonical package names.
        base_package_names: packages proposed for the shared base collection.

    Returns:
        One dict per collection with "collection", "total_packages",
        "base_packages", "remaining_packages", "reduction_percentage",
        and "remaining" (per-package cross-collection detail).
    """
    # Count how many collections each package appears in with a single pass,
    # rather than re-scanning every collection for every distinct package
    # (which was O(packages x collections)).
    pkg_counts: dict[NormalizedName, int] = {}
    for pkgs in collections.values():
        for pkg in pkgs:
            pkg_counts[pkg] = pkg_counts.get(pkg, 0) + 1

    result = []
    for coll_name, pkgs in collections.items():
        base_pkgs = pkgs & base_package_names
        remaining_pkgs = pkgs - base_package_names
        # Most-shared packages first: good candidates for a secondary base.
        remaining_detail = sorted(
            (
                {"package": pkg, "collection_count": pkg_counts[pkg]}
                for pkg in remaining_pkgs
            ),
            key=lambda x: (
                -typing.cast(int, x["collection_count"]),
                typing.cast(str, x["package"]),
            ),
        )
        result.append(
            {
                "collection": coll_name,
                "total_packages": len(pkgs),
                "base_packages": len(base_pkgs),
                "remaining_packages": len(remaining_pkgs),
                # Guard against empty collections to avoid ZeroDivisionError.
                "reduction_percentage": (
                    round(len(base_pkgs) / len(pkgs) * 100, 1) if pkgs else 0.0
                ),
                "remaining": remaining_detail,
            }
        )
    result.sort(
        key=lambda x: (
            -typing.cast(int, x["remaining_packages"]),
            typing.cast(str, x["collection"]),
        )
    )
    return result
877+
878+
879+
def _suggest_base_table(
    candidates: list[dict[str, typing.Any]],
    total_collections: int,
    collection_names: list[str],
    min_collections: int,
    base_packages: set[NormalizedName] | None,
    total_unique_packages: int,
    impact: list[dict[str, typing.Any]],
) -> None:
    """Render the suggest-base report as rich MARKDOWN tables on the console."""
    track_base = base_packages is not None

    header = (
        f"Base collection candidates "
        f"(threshold: {min_collections}/{total_collections} collections)\n"
        f"Collections: {', '.join(sorted(collection_names))}"
    )
    candidate_table = Table(title=header, box=rich.box.MARKDOWN, title_justify="left")
    candidate_table.add_column("Package", justify="left", no_wrap=True)
    candidate_table.add_column("Collections", justify="right", no_wrap=True)
    candidate_table.add_column("Coverage", justify="right", no_wrap=True)
    candidate_table.add_column("Appears In", justify="left")
    if track_base:
        # Extra column marks candidates already present in the --base graph.
        candidate_table.add_column("In Base", justify="center", no_wrap=True)

    already_in_base = 0
    new_candidates = 0
    for candidate in candidates:
        name = candidate["package"]
        count = candidate["count"]
        row = [
            name,
            f"{count}/{total_collections}",
            f"{(count / total_collections) * 100:.1f}%",
            ", ".join(candidate["collections"]),
        ]
        if track_base:
            in_base = name in base_packages
            if in_base:
                already_in_base += 1
            else:
                new_candidates += 1
            row.append("yes" if in_base else "no")
        else:
            new_candidates += 1
        candidate_table.add_row(*row)

    console = rich.get_console()
    console.print(candidate_table)
    console.print(f"\nTotal unique packages: {total_unique_packages}")
    console.print(f"Packages in >= {min_collections} collections: {len(candidates)}")
    if track_base:
        console.print(f"Already in base: {already_in_base}")
        console.print(f"New candidates: {new_candidates}")

    # Per-collection impact of adopting the proposed base.
    impact_table = Table(
        title="Collection Impact", box=rich.box.MARKDOWN, title_justify="left"
    )
    for heading in ("Collection", "Total Pkgs", "In Base", "Remaining", "% Saved"):
        impact_table.add_column(
            heading,
            justify="left" if heading == "Collection" else "right",
            no_wrap=True,
        )
    for record in impact:
        impact_table.add_row(
            record["collection"],
            str(record["total_packages"]),
            str(record["base_packages"]),
            str(record["remaining_packages"]),
            f"{record['reduction_percentage']:.1f}%",
        )
    console.print(impact_table)

    # Deduplicate remaining packages across collections; the first record
    # seen wins (all records for a package carry the same count).
    unique_remaining: dict[NormalizedName, dict[str, typing.Any]] = {}
    for record in impact:
        for pkg_entry in record["remaining"]:
            unique_remaining.setdefault(pkg_entry["package"], pkg_entry)
    remaining_rows = sorted(
        unique_remaining.values(),
        key=lambda x: (-x["collection_count"], x["package"]),
    )

    remaining_table = Table(
        title="Remaining Packages (not in proposed base)",
        box=rich.box.MARKDOWN,
        title_justify="left",
    )
    remaining_table.add_column("Package", justify="left", no_wrap=True)
    remaining_table.add_column("Collections", justify="right", no_wrap=True)
    remaining_table.add_column("Coverage", justify="right", no_wrap=True)
    for pkg_entry in remaining_rows:
        count = pkg_entry["collection_count"]
        remaining_table.add_row(
            pkg_entry["package"],
            f"{count}/{total_collections}",
            f"{(count / total_collections) * 100:.1f}%",
        )
    console.print(remaining_table)
978+
979+
980+
def _suggest_base_json(
    candidates: list[dict[str, typing.Any]],
    total_collections: int,
    collection_names: list[str],
    min_collections: int,
    base_packages: set[NormalizedName] | None,
    base_graph: str | None,
    total_unique_packages: int,
    impact: list[dict[str, typing.Any]],
) -> None:
    """Write the suggest-base report to stdout as indented JSON."""
    metadata: dict[str, typing.Any] = {
        "total_collections": total_collections,
        "total_unique_packages": total_unique_packages,
        "packages_meeting_threshold": len(candidates),
        "collections": sorted(collection_names),
        "min_collections": min_collections,
    }
    if base_graph is not None:
        metadata["base_graph"] = base_graph

    serialized_candidates: list[dict[str, typing.Any]] = []
    for entry in candidates:
        record: dict[str, typing.Any] = {
            "package": entry["package"],
            "collections": entry["collections"],
            "collection_count": entry["count"],
            "coverage_percentage": round(
                (entry["count"] / total_collections) * 100, 1
            ),
        }
        if base_packages is not None:
            # Only emitted when a --base graph was supplied.
            record["in_base"] = entry["package"] in base_packages
        serialized_candidates.append(record)

    output: dict[str, typing.Any] = {
        "metadata": metadata,
        "candidates": serialized_candidates,
        "collection_impact": impact,
    }
    json.dump(output, sys.stdout, indent=2)
1020+
1021+
1022+
def _suggest_base_impl(
    collection_graphs: tuple[str, ...],
    base_graph: str | None,
    min_collections: int | None,
    output_format: str,
) -> None:
    """Core implementation for suggest_base, testable without a click context."""
    graph_count = len(collection_graphs)
    if graph_count < 2:
        raise click.UsageError("At least 2 collection graphs are required")
    if min_collections is None:
        # Default threshold: half of the inputs, rounded up, never below 2.
        min_collections = max(2, math.ceil(graph_count / 2))
    elif min_collections < 2:
        raise click.UsageError("--min-collections must be >= 2")
    if min_collections > graph_count:
        raise click.UsageError(
            f"--min-collections ({min_collections}) cannot exceed number of graphs ({graph_count})"
        )

    # Read every input graph, dropping any that contain no packages.
    collections: dict[str, set[NormalizedName]] = {}
    for graph_path in collection_graphs:
        collection_name = _get_collection_name(graph_path)
        package_names = _get_collection_packages(graph_path)
        if package_names:
            collections[collection_name] = package_names
        else:
            logger.warning("Collection %s is empty, skipping", collection_name)

    # Packages already in an existing base graph, when one was supplied.
    base_packages: set[NormalizedName] | None = (
        _get_collection_packages(base_graph) if base_graph else None
    )

    total_unique_packages = len(set().union(*collections.values()))
    candidates = _find_shared_packages(collections, min_collections)
    total = len(collections)

    # The proposed base is the full candidate set; measure its effect.
    proposed_base: set[NormalizedName] = {entry["package"] for entry in candidates}
    impact = _compute_collection_impact(collections, proposed_base)

    if output_format == "json":
        _suggest_base_json(
            candidates,
            total,
            list(collections),
            min_collections,
            base_packages,
            base_graph,
            total_unique_packages,
            impact,
        )
    else:
        _suggest_base_table(
            candidates,
            total,
            list(collections),
            min_collections,
            base_packages,
            total_unique_packages,
            impact,
        )
1083+
1084+
1085+
# CLI entry point: `fromager graph suggest-base GRAPH [GRAPH ...]`.
@graph.command()
@click.option(
    "--base",
    "base_graph",
    type=str,
    default=None,
    help="Existing base collection graph to enhance",
)
@click.option(
    "--min-collections",
    type=int,
    default=None,
    help="Minimum collections a package must appear in (default: 50% of provided collections)",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format (default: table)",
)
@click.argument("collection_graphs", nargs=-1, required=True)
@click.pass_obj
def suggest_base(
    wkctx: context.WorkContext,
    collection_graphs: tuple[str, ...],
    base_graph: str | None,
    min_collections: int | None,
    output_format: str,
) -> None:
    """Suggest packages for a shared base collection.

    Analyzes COLLECTION_GRAPHS (2 or more graph files) to identify packages
    appearing across multiple collections. These are candidates for factoring
    into a base collection built once and reused.
    """
    # wkctx is injected by @click.pass_obj but is not used here; all of the
    # logic lives in _suggest_base_impl so it can be tested without a click
    # context. Argument validation (>= 2 graphs, threshold bounds) happens
    # inside the implementation as well.
    _suggest_base_impl(collection_graphs, base_graph, min_collections, output_format)

0 commit comments

Comments
 (0)