09362bc3e8
On very large graphs (14k+ paths), we'd end up with a massive in memory tree of mostly duplication. We can safely cache trees and point back to them later, saving memory.
# IMPORTANT: Making changes?
#
# Validate your changes with python3 ./closure-graph.py --test


# Using a simple algorithm, convert the references to a path into a
# sorted list of dependent paths based on how often they're referenced
# and how deep in the tree they live. Equally-"popular" paths are then
# sorted by name.
#
# The existing writeReferencesToFile prints the paths in a simple
# ascii-based sorting of the paths.
#
# Sorting the paths by graph improves the chances that the difference
# between two builds appears near the end of the list, instead of near
# the beginning. This makes a difference for Nix builds which export a
# closure for another program to consume, if that program implements its
# own level of binary diffing.
#
# For example, Docker images. If each store path is a separate layer
# then Docker images can be very efficiently transferred between systems,
# and we get very good cache reuse between images built with the same
# version of Nixpkgs. However, since Docker only reliably supports a
# small number of layers (42) it is important to pick the individual
# layers carefully. By storing very popular store paths in the first 40
# layers, we improve the chances that the next Docker image will share
# many of those layers.*
#
# Given the dependency tree:
#
#     A - B - C - D -\
#      \   \   \      \
#       \   \   \      \
#        \   \   \- E ---- F
#         \- G
#
# Nodes which have multiple references are duplicated:
#
#     A - B - C - D - F
#      \   \   \
#       \   \   \- E - F
#        \   \
#         \   \- E - F
#          \
#           \- G
#
# Each leaf node is now replaced by a counter defaulted to 1:
#
#     A - B - C - D - (F:1)
#      \   \   \
#       \   \   \- E - (F:1)
#        \   \
#         \   \- E - (F:1)
#          \
#           \- (G:1)
#
# Then each leaf counter is merged with its parent node, replacing the
# parent node with a counter of 1, and each existing counter being
# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
#
#     A - B - C - (D:1, F:2)
#      \   \   \
#       \   \   \- (E:1, F:2)
#        \   \
#         \   \- (E:1, F:2)
#          \
#           \- (G:1)
#
# Then each leaf counter is merged with its parent node again, merging
# any counters, then incrementing each:
#
#     A - B - (C:1, D:2, E:2, F:5)
#      \   \
#       \   \- (E:1, F:2)
#        \
#         \- (G:1)
#
# And again:
#
#     A - (B:1, C:2, D:3, E:4, F:8)
#      \
#       \- (G:1)
#
# And again:
#
#     (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
#
# and then paths have the following "popularity":
#
#     A  1
#     B  2
#     C  3
#     D  4
#     E  5
#     F  9
#     G  2
#
# and the popularity contest would result in the paths being printed as:
#
#     F
#     E
#     D
#     C
#     B
#     G
#     A
#
# * Note: People who have used a Dockerfile before assume Docker's
#   layers are inherently ordered. However, this is not true -- Docker
#   layers are content-addressable and are not explicitly layered until
#   they are composed into an image.

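# The code below implements this in stages, mirroring main(): find_roots
# picks out the paths nothing else refers to, make_lookup builds a
# path -> references table, make_graph_segment_from_root expands each root
# into a nested tree (caching and sharing repeated subtrees),
# graph_popularity_contest folds that tree into the counts described above,
# and order_by_popularity produces the final list, most popular first.
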
import sys
import json
import unittest

from pprint import pprint
from collections import defaultdict


def debug(msg, *args, **kwargs):
    # Flip False to True to get debug logging on stderr.
    if False:
        print(
            "DEBUG: {}".format(
                msg.format(*args, **kwargs)
            ),
            file=sys.stderr
        )


# Find paths in the original dataset which are never referenced by
# any other paths
def find_roots(closures):
    roots = []

    for closure in closures:
        path = closure['path']
        if not any_refer_to(path, closures):
            roots.append(path)

    return roots


class TestFindRoots(unittest.TestCase):
    def test_find_roots(self):
        self.assertCountEqual(
            find_roots([
                {
                    "path": "/nix/store/foo",
                    "references": [
                        "/nix/store/foo",
                        "/nix/store/bar"
                    ]
                },
                {
                    "path": "/nix/store/bar",
                    "references": [
                        "/nix/store/bar",
                        "/nix/store/tux"
                    ]
                },
                {
                    "path": "/nix/store/hello",
                    "references": [ ]
                }
            ]),
            ["/nix/store/foo", "/nix/store/hello"]
        )


def any_refer_to(path, closures):
    for closure in closures:
        if path != closure['path']:
            if path in closure['references']:
                return True
    return False


class TestAnyReferTo(unittest.TestCase):
    def test_has_references(self):
        self.assertTrue(
            any_refer_to(
                "/nix/store/bar",
                [
                    {
                        "path": "/nix/store/foo",
                        "references": [
                            "/nix/store/bar"
                        ]
                    },
                ]
            )
        )

    def test_no_references(self):
        self.assertFalse(
            any_refer_to(
                "/nix/store/foo",
                [
                    {
                        "path": "/nix/store/foo",
                        "references": [
                            "/nix/store/foo",
                            "/nix/store/bar"
                        ]
                    },
                ]
            )
        )


def all_paths(closures):
    paths = []
    for closure in closures:
        paths.append(closure['path'])
        paths.extend(closure['references'])
    paths.sort()
    # Deduplicate. Note that set() does not preserve the sorted order;
    # callers only check membership or compare without regard to order.
    return list(set(paths))


class TestAllPaths(unittest.TestCase):
    def test_returns_all_paths(self):
        self.assertCountEqual(
            all_paths([
                {
                    "path": "/nix/store/foo",
                    "references": [
                        "/nix/store/foo",
                        "/nix/store/bar"
                    ]
                },
                {
                    "path": "/nix/store/bar",
                    "references": [
                        "/nix/store/bar",
                        "/nix/store/tux"
                    ]
                },
                {
                    "path": "/nix/store/hello",
                    "references": [ ]
                }
            ]),
            ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux"]
        )

    def test_no_references(self):
        self.assertCountEqual(
            all_paths([
                {
                    "path": "/nix/store/foo",
                    "references": [ ]
                },
            ]),
            ["/nix/store/foo"]
        )


# Convert:
#
# [
#    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
#    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
#    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
#    { path: /nix/store/tux, references: [ /nix/store/tux ] }
# ]
#
# To:
# {
#    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
#    /nix/store/bar: [ /nix/store/baz ],
#    /nix/store/baz: [ /nix/store/tux ],
#    /nix/store/tux: [ ]
# }
#
# Note that it drops self-references to avoid loops.
def make_lookup(closures):
    lookup = {}

    for closure in closures:
        # paths often self-refer
        nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']]
        lookup[closure['path']] = nonreferential_paths

    return lookup


class TestMakeLookup(unittest.TestCase):
    def test_returns_lookup(self):
        self.assertDictEqual(
            make_lookup([
                {
                    "path": "/nix/store/foo",
                    "references": [
                        "/nix/store/foo",
                        "/nix/store/bar"
                    ]
                },
                {
                    "path": "/nix/store/bar",
                    "references": [
                        "/nix/store/bar",
                        "/nix/store/tux"
                    ]
                },
                {
                    "path": "/nix/store/hello",
                    "references": [ ]
                }
            ]),
            {
                "/nix/store/foo": [ "/nix/store/bar" ],
                "/nix/store/bar": [ "/nix/store/tux" ],
                "/nix/store/hello": [ ],
            }
        )


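# Illustrative addition (not in the original test suite): the four-path
# example from the comment above make_lookup, checked end to end. The store
# paths are the ones used in that comment.
class TestMakeLookupDocExample(unittest.TestCase):
    def test_matches_comment_example(self):
        self.assertDictEqual(
            make_lookup([
                { "path": "/nix/store/foo", "references": [ "/nix/store/foo", "/nix/store/bar", "/nix/store/baz" ] },
                { "path": "/nix/store/bar", "references": [ "/nix/store/bar", "/nix/store/baz" ] },
                { "path": "/nix/store/baz", "references": [ "/nix/store/baz", "/nix/store/tux" ] },
                { "path": "/nix/store/tux", "references": [ "/nix/store/tux" ] }
            ]),
            {
                "/nix/store/foo": [ "/nix/store/bar", "/nix/store/baz" ],
                "/nix/store/bar": [ "/nix/store/baz" ],
                "/nix/store/baz": [ "/nix/store/tux" ],
                "/nix/store/tux": [ ],
            }
        )
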
# Convert:
#
# /nix/store/foo with
# {
#    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
#    /nix/store/bar: [ /nix/store/baz ],
#    /nix/store/baz: [ /nix/store/tux ],
#    /nix/store/tux: [ ]
# }
#
# To:
#
# {
#    /nix/store/bar: {
#        /nix/store/baz: {
#            /nix/store/tux: {}
#        }
#    },
#    /nix/store/baz: {
#        /nix/store/tux: {}
#    }
# }
subgraphs_cache = {}
def make_graph_segment_from_root(root, lookup):
    global subgraphs_cache
    children = {}
    for ref in lookup[root]:
        # make_graph_segment_from_root is a pure function, and will
        # always return the same result based on a given input. Thus,
        # cache computation.
        #
        # Python's assignment will use a pointer, preventing memory
        # bloat for large graphs.
        if ref not in subgraphs_cache:
            debug("Subgraph Cache miss on {}".format(ref))
            subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup)
        else:
            debug("Subgraph Cache hit on {}".format(ref))
        children[ref] = subgraphs_cache[ref]
    return children


class TestMakeGraphSegmentFromRoot(unittest.TestCase):
    def test_returns_graph(self):
        self.assertDictEqual(
            make_graph_segment_from_root("/nix/store/foo", {
                "/nix/store/foo": [ "/nix/store/bar" ],
                "/nix/store/bar": [ "/nix/store/tux" ],
                "/nix/store/tux": [ ],
                "/nix/store/hello": [ ],
            }),
            {
                "/nix/store/bar": {
                    "/nix/store/tux": {}
                }
            }
        )

    def test_returns_graph_tiny(self):
        self.assertDictEqual(
            make_graph_segment_from_root("/nix/store/tux", {
                "/nix/store/foo": [ "/nix/store/bar" ],
                "/nix/store/bar": [ "/nix/store/tux" ],
                "/nix/store/tux": [ ],
            }),
            {}
        )


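# Illustrative addition (not in the original test suite): two roots that both
# reach the same path should end up pointing at the same cached subtree
# object, which is what keeps memory usage down on very large graphs. The
# store paths below are made up for this test.
class TestSubgraphSharing(unittest.TestCase):
    def test_shared_subtree_is_cached_once(self):
        lookup = {
            "/nix/store/left": [ "/nix/store/shared" ],
            "/nix/store/right": [ "/nix/store/shared" ],
            "/nix/store/shared": [ ],
        }
        left = make_graph_segment_from_root("/nix/store/left", lookup)
        right = make_graph_segment_from_root("/nix/store/right", lookup)
        self.assertIs(left["/nix/store/shared"], right["/nix/store/shared"])
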
# Convert a graph segment into a popularity-counted dictionary:
#
# From:
# {
#    /nix/store/foo: {
#        /nix/store/bar: {
#            /nix/store/baz: {
#                /nix/store/tux: {}
#            }
#        },
#        /nix/store/baz: {
#            /nix/store/tux: {}
#        }
#    }
# }
#
# to:
# [
#    /nix/store/foo: 1
#    /nix/store/bar: 2
#    /nix/store/baz: 4
#    /nix/store/tux: 6
# ]
popularity_cache = {}
def graph_popularity_contest(full_graph):
    global popularity_cache
    popularity = defaultdict(int)
    for path, subgraph in full_graph.items():
        popularity[path] += 1
        # graph_popularity_contest is a pure function, and will
        # always return the same result based on a given input. Thus,
        # cache computation.
        #
        # Python's assignment will use a pointer, preventing memory
        # bloat for large graphs.
        if path not in popularity_cache:
            debug("Popularity Cache miss on {}", path)
            popularity_cache[path] = graph_popularity_contest(subgraph)
        else:
            debug("Popularity Cache hit on {}", path)

        subcontest = popularity_cache[path]
        for subpath, subpopularity in subcontest.items():
            debug("Calculating popularity for {}", subpath)
            popularity[subpath] += subpopularity + 1

    return popularity


class TestGraphPopularityContest(unittest.TestCase):
    def test_counts_popularity(self):
        self.assertDictEqual(
            graph_popularity_contest({
                "/nix/store/foo": {
                    "/nix/store/bar": {
                        "/nix/store/baz": {
                            "/nix/store/tux": {}
                        }
                    },
                    "/nix/store/baz": {
                        "/nix/store/tux": {}
                    }
                }
            }),
            {
                "/nix/store/foo": 1,
                "/nix/store/bar": 2,
                "/nix/store/baz": 4,
                "/nix/store/tux": 6,
            }
        )


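# Illustrative addition (not in the original test suite): once a subgraph's
# popularity has been computed, graph_popularity_contest reuses the result
# from popularity_cache instead of walking the subtree again. The path below
# is made up for this test.
class TestPopularityCacheReuse(unittest.TestCase):
    def test_subgraph_result_is_cached(self):
        graph_popularity_contest({ "/nix/store/only-for-cache-test": {} })
        self.assertIn("/nix/store/only-for-cache-test", popularity_cache)
        self.assertEqual(dict(popularity_cache["/nix/store/only-for-cache-test"]), {})
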
# Emit a list of packages by popularity, most first:
#
# From:
# [
#    /nix/store/foo: 1
#    /nix/store/bar: 1
#    /nix/store/baz: 2
#    /nix/store/tux: 2
# ]
#
# To:
# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
def order_by_popularity(paths):
    paths_by_popularity = defaultdict(list)
    popularities = []
    for path, popularity in paths.items():
        popularities.append(popularity)
        paths_by_popularity[popularity].append(path)

    popularities = list(set(popularities))
    popularities.sort()

    flat_ordered = []
    for popularity in popularities:
        paths = paths_by_popularity[popularity]
        paths.sort(key=package_name)

        flat_ordered.extend(reversed(paths))
    return list(reversed(flat_ordered))


class TestOrderByPopularity(unittest.TestCase):
    def test_returns_in_order(self):
        self.assertEqual(
            order_by_popularity({
                "/nix/store/foo": 1,
                "/nix/store/bar": 1,
                "/nix/store/baz": 2,
                "/nix/store/tux": 2,
            }),
            [
                "/nix/store/baz",
                "/nix/store/tux",
                "/nix/store/bar",
                "/nix/store/foo"
            ]
        )


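# Illustrative addition (not in the original test suite): the worked A..G
# example from the header comment, run through the whole pipeline. The
# single-letter names stand in for store paths.
class TestHeaderExample(unittest.TestCase):
    def test_header_example_order(self):
        graph = [
            { "path": "A", "references": [ "A", "B", "G" ] },
            { "path": "B", "references": [ "B", "C", "E" ] },
            { "path": "C", "references": [ "C", "D", "E" ] },
            { "path": "D", "references": [ "D", "F" ] },
            { "path": "E", "references": [ "E", "F" ] },
            { "path": "F", "references": [ "F" ] },
            { "path": "G", "references": [ "G" ] },
        ]
        lookup = make_lookup(graph)
        full_graph = {}
        for root in find_roots(graph):
            full_graph[root] = make_graph_segment_from_root(root, lookup)
        ordered = order_by_popularity(graph_popularity_contest(full_graph))
        self.assertEqual(ordered, ["F", "E", "D", "C", "B", "G", "A"])
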
def package_name(path):
    parts = path.split('-')
    start = parts.pop(0)
    # Don't throw away any data, so the order is always the same,
    # even in cases where only the hash at the start has changed.
    parts.append(start)
    return '-'.join(parts)


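# Illustrative addition (not in the original test suite): package_name moves
# the leading hash segment to the end, so two builds of the same package sort
# together even when their store hashes differ. The store path is made up.
class TestPackageName(unittest.TestCase):
    def test_moves_hash_to_the_end(self):
        self.assertEqual(
            package_name("/nix/store/aaaa-hello-2.10"),
            "hello-2.10-/nix/store/aaaa"
        )
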
def main():
    # The graph is read from a JSON file named by the first argument; the
    # second argument is the key inside that document holding the list of
    # closures to sort.
    filename = sys.argv[1]
    key = sys.argv[2]

    debug("Loading from {}", filename)
    with open(filename) as f:
        data = json.load(f)

    # Data comes in as:
    # [
    #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
    #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
    #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
    #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
    # ]
    #
    # and we want to get out a list of paths ordered by how universally
    # important they are, i.e. tux is referenced by every path, transitively,
    # so it should be #1:
    #
    # [
    #    /nix/store/tux,
    #    /nix/store/baz,
    #    /nix/store/bar,
    #    /nix/store/foo,
    # ]
    graph = data[key]

    debug("Finding roots from {}", key)
    roots = find_roots(graph)
    debug("Making lookup for {}", key)
    lookup = make_lookup(graph)

    full_graph = {}
    for root in roots:
        debug("Making full graph for {}", root)
        full_graph[root] = make_graph_segment_from_root(root, lookup)

    debug("Running contest")
    contest = graph_popularity_contest(full_graph)
    debug("Ordering by popularity")
    ordered = order_by_popularity(contest)
    debug("Checking for missing paths")
    missing = []
    for path in all_paths(graph):
        if path not in ordered:
            missing.append(path)

    ordered.extend(missing)
    print("\n".join(ordered))


if "--test" in sys.argv:
    # Strip --test before handing argv to unittest, which would otherwise
    # reject the unknown option.
    unittest.main(argv=[f for f in sys.argv if f != "--test"])
else:
    main()