|
2 | 2 | import itertools |
3 | 3 | import json |
4 | 4 | import logging |
| 5 | +import math |
5 | 6 | import pathlib |
6 | 7 | import sys |
7 | 8 | import typing |
8 | 9 |
|
9 | 10 | import click |
| 11 | +import rich |
| 12 | +import rich.box |
10 | 13 | from packaging.requirements import Requirement |
11 | | -from packaging.utils import canonicalize_name |
| 14 | +from packaging.utils import NormalizedName, canonicalize_name |
12 | 15 | from packaging.version import Version |
| 16 | +from rich.table import Table |
13 | 17 |
|
14 | 18 | from fromager import clickext, context |
15 | 19 | from fromager.commands import bootstrap |
@@ -784,3 +788,334 @@ def n2s(nodes: typing.Iterable[DependencyNode]) -> str: |
784 | 788 | topo.done(*nodes_to_build) |
785 | 789 |
|
786 | 790 | print(f"\nBuilding {len(graph)} packages in {rounds} rounds.") |
| 791 | + |
| 792 | + |
| 793 | +def _get_collection_name(graph_path: str) -> str: |
| 794 | + """Derive collection name from file path stem.""" |
| 795 | + return pathlib.Path(graph_path).stem |
| 796 | + |
| 797 | + |
def _get_collection_packages(graph_path: str) -> set[NormalizedName]:
    """Load the graph file at *graph_path* and collect every canonical
    package name it contains, excluding the synthetic ROOT node."""
    packages: set[NormalizedName] = set()
    for node in DependencyGraph.from_file(graph_path).get_all_nodes():
        if node.canonicalized_name != ROOT:
            packages.add(node.canonicalized_name)
    return packages
| 806 | + |
| 807 | + |
| 808 | +def _find_shared_packages( |
| 809 | + collections: dict[str, set[NormalizedName]], |
| 810 | + min_collections: int, |
| 811 | +) -> list[dict[str, typing.Any]]: |
| 812 | + """Find packages in >= min_collections collections, sorted by count desc then name asc.""" |
| 813 | + all_packages: set[NormalizedName] = set().union(*collections.values()) |
| 814 | + results: list[dict[str, typing.Any]] = [] |
| 815 | + for pkg in all_packages: |
| 816 | + containing = [name for name, pkgs in collections.items() if pkg in pkgs] |
| 817 | + if len(containing) >= min_collections: |
| 818 | + results.append( |
| 819 | + { |
| 820 | + "package": pkg, |
| 821 | + "collections": sorted(containing), |
| 822 | + "count": len(containing), |
| 823 | + } |
| 824 | + ) |
| 825 | + results.sort(key=lambda x: (-x["count"], x["package"])) |
| 826 | + return results |
| 827 | + |
| 828 | + |
| 829 | +def _compute_collection_impact( |
| 830 | + collections: dict[str, set[NormalizedName]], |
| 831 | + base_package_names: set[NormalizedName], |
| 832 | +) -> list[dict[str, typing.Any]]: |
| 833 | + """For each collection, compute how many packages remain after removing base packages. |
| 834 | +
|
| 835 | + Each entry includes per-remaining-package cross-collection counts. |
| 836 | + Sorted by remaining package count descending, then collection name ascending. |
| 837 | + """ |
| 838 | + all_packages: set[NormalizedName] = set().union(*collections.values()) |
| 839 | + pkg_counts: dict[NormalizedName, int] = { |
| 840 | + pkg: sum(1 for pkgs in collections.values() if pkg in pkgs) |
| 841 | + for pkg in all_packages |
| 842 | + } |
| 843 | + |
| 844 | + result = [] |
| 845 | + for coll_name, pkgs in collections.items(): |
| 846 | + base_pkgs = pkgs & base_package_names |
| 847 | + remaining_pkgs = pkgs - base_package_names |
| 848 | + remaining_detail = sorted( |
| 849 | + [ |
| 850 | + {"package": pkg, "collection_count": pkg_counts[pkg]} |
| 851 | + for pkg in remaining_pkgs |
| 852 | + ], |
| 853 | + key=lambda x: ( |
| 854 | + -typing.cast(int, x["collection_count"]), |
| 855 | + typing.cast(str, x["package"]), |
| 856 | + ), |
| 857 | + ) |
| 858 | + result.append( |
| 859 | + { |
| 860 | + "collection": coll_name, |
| 861 | + "total_packages": len(pkgs), |
| 862 | + "base_packages": len(base_pkgs), |
| 863 | + "remaining_packages": len(remaining_pkgs), |
| 864 | + "reduction_percentage": ( |
| 865 | + round(len(base_pkgs) / len(pkgs) * 100, 1) if pkgs else 0.0 |
| 866 | + ), |
| 867 | + "remaining": remaining_detail, |
| 868 | + } |
| 869 | + ) |
| 870 | + result.sort( |
| 871 | + key=lambda x: ( |
| 872 | + -typing.cast(int, x["remaining_packages"]), |
| 873 | + typing.cast(str, x["collection"]), |
| 874 | + ) |
| 875 | + ) |
| 876 | + return result |
| 877 | + |
| 878 | + |
def _suggest_base_table(
    candidates: list[dict[str, typing.Any]],
    total_collections: int,
    collection_names: list[str],
    min_collections: int,
    base_packages: set[NormalizedName] | None,
    total_unique_packages: int,
    impact: list[dict[str, typing.Any]],
) -> None:
    """Render the suggest-base analysis to the console as rich tables.

    Prints, in order: the candidate-package table, summary counts, the
    per-collection impact table, and a deduplicated table of packages that
    would remain outside the proposed base.  The "In Base" column and the
    already/new summary lines appear only when *base_packages* is given.

    NOTE(review): assumes total_collections > 0 whenever candidates or
    remaining rows are non-empty (guaranteed by the caller) — confirm if
    this is ever called directly.
    """
    title = (
        f"Base collection candidates "
        f"(threshold: {min_collections}/{total_collections} collections)\n"
        f"Collections: {', '.join(sorted(collection_names))}"
    )
    # MARKDOWN box so the output can be pasted into docs/issues verbatim.
    table = Table(title=title, box=rich.box.MARKDOWN, title_justify="left")
    table.add_column("Package", justify="left", no_wrap=True)
    table.add_column("Collections", justify="right", no_wrap=True)
    table.add_column("Coverage", justify="right", no_wrap=True)
    table.add_column("Appears In", justify="left")
    # The extra column only exists when an existing base graph was supplied.
    if base_packages is not None:
        table.add_column("In Base", justify="center", no_wrap=True)

    # Tallies for the summary lines printed after the table.
    already_in_base = 0
    new_candidates = 0
    for entry in candidates:
        pkg = entry["package"]
        count = entry["count"]
        cols = entry["collections"]
        coverage = f"{(count / total_collections) * 100:.1f}%"
        count_str = f"{count}/{total_collections}"
        appears_in = ", ".join(cols)
        if base_packages is not None:
            in_base = pkg in base_packages
            if in_base:
                already_in_base += 1
            else:
                new_candidates += 1
            table.add_row(
                pkg, count_str, coverage, appears_in, "yes" if in_base else "no"
            )
        else:
            # Without a base graph every candidate counts as new.
            new_candidates += 1
            table.add_row(pkg, count_str, coverage, appears_in)

    console = rich.get_console()
    console.print(table)
    console.print(f"\nTotal unique packages: {total_unique_packages}")
    console.print(f"Packages in >= {min_collections} collections: {len(candidates)}")
    if base_packages is not None:
        console.print(f"Already in base: {already_in_base}")
        console.print(f"New candidates: {new_candidates}")

    # Collection Impact table
    impact_table = Table(
        title="Collection Impact", box=rich.box.MARKDOWN, title_justify="left"
    )
    impact_table.add_column("Collection", justify="left", no_wrap=True)
    impact_table.add_column("Total Pkgs", justify="right", no_wrap=True)
    impact_table.add_column("In Base", justify="right", no_wrap=True)
    impact_table.add_column("Remaining", justify="right", no_wrap=True)
    impact_table.add_column("% Saved", justify="right", no_wrap=True)
    for entry in impact:
        impact_table.add_row(
            entry["collection"],
            str(entry["total_packages"]),
            str(entry["base_packages"]),
            str(entry["remaining_packages"]),
            f"{entry['reduction_percentage']:.1f}%",
        )
    console.print(impact_table)

    # Remaining Packages table — deduplicated across all collections.
    # The same package can appear in several collections' "remaining" lists;
    # keep only the first occurrence, then re-sort globally.
    seen: set[NormalizedName] = set()
    remaining_rows: list[dict[str, typing.Any]] = []
    for entry in impact:
        for pkg_entry in entry["remaining"]:
            pkg = pkg_entry["package"]
            if pkg not in seen:
                seen.add(pkg)
                remaining_rows.append(pkg_entry)
    remaining_rows.sort(key=lambda x: (-x["collection_count"], x["package"]))

    remaining_table = Table(
        title="Remaining Packages (not in proposed base)",
        box=rich.box.MARKDOWN,
        title_justify="left",
    )
    remaining_table.add_column("Package", justify="left", no_wrap=True)
    remaining_table.add_column("Collections", justify="right", no_wrap=True)
    remaining_table.add_column("Coverage", justify="right", no_wrap=True)
    for pkg_entry in remaining_rows:
        count = pkg_entry["collection_count"]
        remaining_table.add_row(
            pkg_entry["package"],
            f"{count}/{total_collections}",
            f"{(count / total_collections) * 100:.1f}%",
        )
    console.print(remaining_table)
| 978 | + |
| 979 | + |
def _suggest_base_json(
    candidates: list[dict[str, typing.Any]],
    total_collections: int,
    collection_names: list[str],
    min_collections: int,
    base_packages: set[NormalizedName] | None,
    base_graph: str | None,
    total_unique_packages: int,
    impact: list[dict[str, typing.Any]],
) -> None:
    """Write the suggest-base analysis to stdout as an indented JSON document."""
    metadata: dict[str, typing.Any] = {
        "total_collections": total_collections,
        "total_unique_packages": total_unique_packages,
        "packages_meeting_threshold": len(candidates),
        "collections": sorted(collection_names),
        "min_collections": min_collections,
    }
    if base_graph is not None:
        metadata["base_graph"] = base_graph

    serialized: list[dict[str, typing.Any]] = []
    for entry in candidates:
        record: dict[str, typing.Any] = {
            "package": entry["package"],
            "collections": entry["collections"],
            "collection_count": entry["count"],
            "coverage_percentage": round(
                (entry["count"] / total_collections) * 100, 1
            ),
        }
        # The in_base flag is only meaningful when a base graph was loaded.
        if base_packages is not None:
            record["in_base"] = entry["package"] in base_packages
        serialized.append(record)

    output: dict[str, typing.Any] = {
        "metadata": metadata,
        "candidates": serialized,
        "collection_impact": impact,
    }
    json.dump(output, sys.stdout, indent=2)
| 1020 | + |
| 1021 | + |
def _suggest_base_impl(
    collection_graphs: tuple[str, ...],
    base_graph: str | None,
    min_collections: int | None,
    output_format: str,
) -> None:
    """Core implementation for suggest_base, testable without a click context."""
    graph_count = len(collection_graphs)
    if graph_count < 2:
        raise click.UsageError("At least 2 collection graphs are required")
    if min_collections is None:
        # Default threshold: present in at least half the provided graphs,
        # but never fewer than 2.
        min_collections = max(2, math.ceil(graph_count / 2))
    elif min_collections < 2:
        raise click.UsageError("--min-collections must be >= 2")
    if min_collections > graph_count:
        raise click.UsageError(
            f"--min-collections ({min_collections}) cannot exceed number of graphs ({graph_count})"
        )

    # Load every non-empty collection graph, keyed by its derived name.
    collections: dict[str, set[NormalizedName]] = {}
    for graph_path in collection_graphs:
        collection_name = _get_collection_name(graph_path)
        packages = _get_collection_packages(graph_path)
        if not packages:
            logger.warning("Collection %s is empty, skipping", collection_name)
            continue
        collections[collection_name] = packages

    # Optionally load the existing base graph for in-base annotations.
    base_packages: set[NormalizedName] | None = (
        _get_collection_packages(base_graph) if base_graph else None
    )

    total_unique_packages = len(set().union(*collections.values()))
    candidates = _find_shared_packages(collections, min_collections)
    total = len(collections)

    # The proposed base is exactly the set of candidate packages.
    base_package_names: set[NormalizedName] = {
        entry["package"] for entry in candidates
    }
    impact = _compute_collection_impact(collections, base_package_names)

    if output_format == "json":
        _suggest_base_json(
            candidates,
            total,
            list(collections),
            min_collections,
            base_packages,
            base_graph,
            total_unique_packages,
            impact,
        )
    else:
        _suggest_base_table(
            candidates,
            total,
            list(collections),
            min_collections,
            base_packages,
            total_unique_packages,
            impact,
        )
| 1083 | + |
| 1084 | + |
@graph.command()
@click.option(
    "--base",
    "base_graph",
    type=str,
    default=None,
    help="Existing base collection graph to enhance",
)
@click.option(
    "--min-collections",
    type=int,
    default=None,
    help="Minimum collections a package must appear in (default: 50% of provided collections)",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format (default: table)",
)
@click.argument("collection_graphs", nargs=-1, required=True)
@click.pass_obj
def suggest_base(
    wkctx: context.WorkContext,  # injected by @click.pass_obj; unused here
    collection_graphs: tuple[str, ...],
    base_graph: str | None,
    min_collections: int | None,
    output_format: str,
) -> None:
    """Suggest packages for a shared base collection.

    Analyzes COLLECTION_GRAPHS (2 or more graph files) to identify packages
    appearing across multiple collections. These are candidates for factoring
    into a base collection built once and reused.

    With --base, packages already present in that graph are flagged in the
    output.  Results print as a table by default or as JSON with --format.
    """
    # All validation and work live in the helper so it can be unit-tested
    # without constructing a click context.
    _suggest_base_impl(collection_graphs, base_graph, min_collections, output_format)
0 commit comments