references-by-popularity: cache computation to avoid memory bloat

On very large graphs (14k+ paths), we'd end up with a massive
in-memory tree made up mostly of duplicated subtrees.

We can safely cache trees and point back to them later, saving
memory.
This commit is contained in:
Graham Christensen 2019-03-05 16:37:49 -05:00
parent 54826e7471
commit 09362bc3e8
No known key found for this signature in database
GPG Key ID: ACA1C1D120C83D5C

View File

@ -338,11 +338,23 @@ class TestMakeLookup(unittest.TestCase):
# /nix/store/tux: {} # /nix/store/tux: {}
# } # }
# } # }
# Memo of already-built subgraphs, keyed by store path.
subgraphs_cache = {}

# The lookup table the entries in subgraphs_cache were computed from.
# A subgraph depends on the lookup table as well as on the path, so
# the memo is only valid for this one table.
_subgraphs_cache_lookup = None


def make_graph_segment_from_root(root, lookup):
    """Build the nested reference graph hanging below ``root``.

    ``lookup`` maps each path to an iterable of the paths it
    references. The result is a dict mapping every direct reference
    of ``root`` to that reference's own (recursively built) subgraph;
    a path with no references maps to an empty dict.

    Subgraphs are memoized per path in ``subgraphs_cache``. The memo
    is discarded whenever a different ``lookup`` object is passed, so
    results computed from one table are never returned for another.
    Mutating a ``lookup`` in place between calls is not detected.

    Raises ``KeyError`` if ``root`` or any path reachable from it is
    missing from ``lookup``.
    """
    global _subgraphs_cache_lookup

    # Recursive calls pass the very same lookup object, so this only
    # triggers when a caller switches to a different table.
    if _subgraphs_cache_lookup is not lookup:
        subgraphs_cache.clear()
        _subgraphs_cache_lookup = lookup

    children = {}
    for ref in lookup[root]:
        # For a fixed lookup table the subgraph below a path is always
        # the same, so compute it once and reuse it.
        #
        # Assignment stores a reference to the cached dict rather than
        # a copy, so shared subgraphs exist only once in memory no
        # matter how many parents point at them.
        if ref not in subgraphs_cache:
            debug("Subgraph Cache miss on {}".format(ref))
            subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup)
        else:
            debug("Subgraph Cache hit on {}".format(ref))
        children[ref] = subgraphs_cache[ref]
    return children
class TestMakeGraphSegmentFromRoot(unittest.TestCase): class TestMakeGraphSegmentFromRoot(unittest.TestCase):
@ -393,13 +405,27 @@ class TestMakeGraphSegmentFromRoot(unittest.TestCase):
# /nix/store/baz: 4 # /nix/store/baz: 4
# /nix/store/tux: 6 # /nix/store/tux: 6
# ] # ]
# Memo of per-path popularity results for the graph currently being
# counted, keyed by store path.
popularity_cache = {}


def graph_popularity_contest(full_graph):
    """Count how often each path appears in a nested reference graph.

    ``full_graph`` is a nested dict in which every key is a path and
    every value is the graph of that path's references, in the same
    nested form. Returns a ``defaultdict(int)`` mapping each path to
    its popularity score.

    Results for subgraphs are memoized per path in
    ``popularity_cache``, which assumes that a given path has the
    same subgraph everywhere within one ``full_graph``. The memo is
    reset at the start of every top-level call so that counts from an
    earlier, unrelated graph are never reused for a path both graphs
    happen to share.
    """
    popularity_cache.clear()
    return _graph_popularity_contest(full_graph)


def _graph_popularity_contest(graph):
    """Recursive worker for graph_popularity_contest; fills popularity_cache."""
    popularity = defaultdict(int)
    for path, subgraph in graph.items():
        popularity[path] += 1
        # Within one graph the result for a path is always the same,
        # so compute it once and reuse it.
        #
        # Assignment stores a reference to the cached result rather
        # than a copy, which keeps memory flat on large graphs.
        #
        # NOTE(review): these debug calls pass the value as a separate
        # argument, unlike the .format(...) calls used in
        # make_graph_segment_from_root -- confirm debug accepts
        # format arguments.
        if path not in popularity_cache:
            debug("Popularity Cache miss on {}", path)
            popularity_cache[path] = _graph_popularity_contest(subgraph)
        else:
            debug("Popularity Cache hit on {}", path)
        subcontest = popularity_cache[path]
        for subpath, subpopularity in subcontest.items():
            debug("Calculating popularity for {}", subpath)
            popularity[subpath] += subpopularity + 1
    return popularity