"""
This script generates a Docker image from a set of store paths. Uses
Docker Image Specification v1.2 as reference [1].

It expects a JSON file with the following properties and writes the
image as an uncompressed tarball to stdout:

* "architecture", "config", "os", "created", "repo_tag" correspond to
  the fields with the same name on the image spec [2].
* "created" can be "now".
* "created" is also used as mtime for files added to the image.
* "store_layers" is a list of layers in ascending order, where each
  layer is the list of store paths to include in that layer.

The main challenge for this script is to create the final image in a
streaming fashion, without dumping any intermediate data to disk,
for performance.

A docker image archives each layer's contents as a separate tarball,
and they all later get enveloped into a single big tarball in a
content-addressed fashion. However, because of how the "tar" format
works, we have to know the name (which includes the checksum in our
case) and the size of a layer tarball before we can start adding it
to the outer tarball. We achieve that by creating each layer tarball
twice; on the first pass we calculate its size and checksum, and on
the second we actually stream its contents. The 'add_layer_dir'
function does all this.

[1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
[2]: https://github.com/moby/moby/blob/4fb59c20a4fb54f944fe170d0ff1d00eb4a24d6f/image/spec/v1.2.md#image-json-field-descriptions
"""  # noqa: E501
import io
import os
import re
import sys
import json
import hashlib
import pathlib
import tarfile
import itertools
import threading
from datetime import datetime, timezone
from collections import namedtuple


def archive_paths_to(obj, paths, mtime):
    """
    Writes the given store paths as a tar file to the given stream.

    obj: Stream to write to. Should have a 'write' method.
    paths: List of store paths.
    mtime: 'mtime' of the files added to the tarball.
    """

    # gettarinfo makes the paths relative; this makes them
    # absolute again
    def append_root(ti):
        ti.name = "/" + ti.name
        return ti

    def apply_filters(ti):
        ti.mtime = mtime
        ti.uid = 0
        ti.gid = 0
        ti.uname = "root"
        ti.gname = "root"
        return ti

    def nix_root(ti):
        ti.mode = 0o0555  # r-xr-xr-x
        return ti

    def dir(path):
        ti = tarfile.TarInfo(path)
        ti.type = tarfile.DIRTYPE
        return ti

    with tarfile.open(fileobj=obj, mode="w|") as tar:
        # To be consistent with the docker utilities, we need to have
        # these directories first when building layer tarballs.
        tar.addfile(apply_filters(nix_root(dir("/nix"))))
        tar.addfile(apply_filters(nix_root(dir("/nix/store"))))

        for path in paths:
            path = pathlib.Path(path)
            if path.is_symlink():
                files = [path]
            else:
                files = itertools.chain([path], path.rglob("*"))

            for filename in sorted(files):
                ti = append_root(tar.gettarinfo(filename))

                # copy hardlinks as regular files
                if ti.islnk():
                    ti.type = tarfile.REGTYPE
                    ti.linkname = ""
                    ti.size = filename.stat().st_size

                ti = apply_filters(ti)
                if ti.isfile():
                    with open(filename, "rb") as f:
                        tar.addfile(ti, f)
                else:
                    tar.addfile(ti)

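# Usage sketch (hypothetical path): streaming a single store path as a
# layer tarball to stdout would look like
#
#   archive_paths_to(sys.stdout.buffer, ["/nix/store/...-hello"], mtime=0)
#
# which emits /nix, /nix/store, and then the path's full file tree.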

class ExtractChecksum:
    """
    A writable stream which only calculates the final file size and
    sha256sum, while discarding the actual contents.
    """

    def __init__(self):
        self._digest = hashlib.sha256()
        self._size = 0

    def write(self, data):
        self._digest.update(data)
        self._size += len(data)

    def extract(self):
        """
        Returns: Hex-encoded sha256sum and size as a tuple.
        """
        return (self._digest.hexdigest(), self._size)
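
# A minimal sketch of the "dry run" first pass described in the module
# docstring:
#
#   ec = ExtractChecksum()
#   ec.write(b"hello")
#   digest, size = ec.extract()  # sha256 hex digest of b"hello", size == 5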

FromImage = namedtuple("FromImage", ["tar", "manifest_json", "image_json"])
# Some metadata for a layer
LayerInfo = namedtuple("LayerInfo", ["size", "checksum", "path", "paths"])
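
# Shape of a base image archive, inferred from the fields accessed in
# 'load_from_image' and 'add_base_layers' below (example values are
# hypothetical):
#
#   manifest.json   [{"Config": "<hex>.json",
#                     "Layers": ["<hex>/layer.tar", ...], ...}]
#   <hex>.json      {"config": {...},
#                    "rootfs": {"diff_ids": ["sha256:<hex>", ...]}}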

def load_from_image(from_image_str):
    """
    Loads the given base image, if any.

    from_image_str: Path to the base image archive.

    Returns: A 'FromImage' object with references to the loaded base image,
             or 'None' if no base image was provided.
    """
    if from_image_str is None:
        return None

    base_tar = tarfile.open(from_image_str)

    manifest_json_tarinfo = base_tar.getmember("manifest.json")
    with base_tar.extractfile(manifest_json_tarinfo) as f:
        manifest_json = json.load(f)

    image_json_tarinfo = base_tar.getmember(manifest_json[0]["Config"])
    with base_tar.extractfile(image_json_tarinfo) as f:
        image_json = json.load(f)

    return FromImage(base_tar, manifest_json, image_json)

def add_base_layers(tar, from_image):
    """
    Adds the layers from the given base image to the final image.

    tar: 'tarfile.TarFile' object for new layers to be added to.
    from_image: 'FromImage' object with references to the loaded base image.
    """
    if from_image is None:
        print("No 'fromImage' provided", file=sys.stderr)
        return []

    layers = from_image.manifest_json[0]["Layers"]
    checksums = from_image.image_json["rootfs"]["diff_ids"]
    layers_checksums = zip(layers, checksums)

    for num, (layer, checksum) in enumerate(layers_checksums, start=1):
        layer_tarinfo = from_image.tar.getmember(layer)
        checksum = re.sub(r"^sha256:", "", checksum)

        tar.addfile(layer_tarinfo, from_image.tar.extractfile(layer_tarinfo))
        path = layer_tarinfo.path
        size = layer_tarinfo.size

        print("Adding base layer", num, "from", path, file=sys.stderr)
        yield LayerInfo(size=size, checksum=checksum, path=path, paths=[path])

    from_image.tar.close()

def overlay_base_config(from_image, final_config):
    """
    Overlays the final image 'config' JSON on top of selected defaults from
    the base image 'config' JSON.

    from_image: 'FromImage' object with references to the loaded base image.
    final_config: 'dict' object of the final image 'config' JSON.
    """
    if from_image is None:
        return final_config

    base_config = from_image.image_json["config"]

    # Preserve environment from base image
    final_env = base_config.get("Env", []) + final_config.get("Env", [])
    if final_env:
        # Resolve duplicates (last one wins) and format back as list
        resolved_env = {entry.split("=", 1)[0]: entry for entry in final_env}
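        # For example (hypothetical values): a base Env of
        # ["PATH=/bin", "FOO=1"] merged with a final Env of ["FOO=2"]
        # resolves to ["PATH=/bin", "FOO=2"], since entries are keyed by
        # variable name and later entries overwrite earlier ones.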
        final_config["Env"] = list(resolved_env.values())

    return final_config


def add_layer_dir(tar, paths, store_dir, mtime):
    """
    Appends given store paths to a TarFile object as a new layer.

    tar: 'tarfile.TarFile' object for the new layer to be added to.
    paths: List of store paths.
    store_dir: the root directory of the nix store
    mtime: 'mtime' of the added files and the layer tarball.
           Should be an integer representing a POSIX time.

    Returns: A 'LayerInfo' object containing some metadata of
             the layer added.
    """

    invalid_paths = [i for i in paths if not i.startswith(store_dir)]
    assert len(invalid_paths) == 0, \
        f"Expecting absolute paths from {store_dir}, but got: {invalid_paths}"

    # First, calculate the tarball checksum and the size.
    extract_checksum = ExtractChecksum()
    archive_paths_to(
        extract_checksum,
        paths,
        mtime=mtime,
    )
    (checksum, size) = extract_checksum.extract()

    path = f"{checksum}/layer.tar"
    layer_tarinfo = tarfile.TarInfo(path)
    layer_tarinfo.size = size
    layer_tarinfo.mtime = mtime

    # Then actually stream the contents to the outer tarball.
    read_fd, write_fd = os.pipe()
    with open(read_fd, "rb") as read, open(write_fd, "wb") as write:
        def producer():
            archive_paths_to(
                write,
                paths,
                mtime=mtime,
            )
            write.close()

        # Closing the write end of the pipe also closes the read end,
        # so we don't need to wait until this thread is finished.
        #
        # Any exception from the thread will get printed by the default
        # exception handler, and the 'addfile' call will fail since it
        # won't be able to read the required number of bytes.
        threading.Thread(target=producer).start()
        tar.addfile(layer_tarinfo, read)

    return LayerInfo(size=size, checksum=checksum, path=path, paths=paths)


def add_customisation_layer(target_tar, customisation_layer, mtime):
    """
    Adds the customisation layer as a new layer. This layer is structured
    differently; the given store path already contains the 'layer.tar' and
    its corresponding sha256sum, ready to be added.

    target_tar: 'tarfile.TarFile' object for the new layer to be added to.
    customisation_layer: Path containing the layer archive.
    mtime: 'mtime' of the added layer tarball.
    """

    checksum_path = os.path.join(customisation_layer, "checksum")
    with open(checksum_path) as f:
        checksum = f.read().strip()
    assert len(checksum) == 64, f"Invalid sha256 at {checksum_path}."

    layer_path = os.path.join(customisation_layer, "layer.tar")

    path = f"{checksum}/layer.tar"
    tarinfo = target_tar.gettarinfo(layer_path)
    tarinfo.name = path
    tarinfo.mtime = mtime

    with open(layer_path, "rb") as f:
        target_tar.addfile(tarinfo, f)

    return LayerInfo(
        size=None,
        checksum=checksum,
        path=path,
        paths=[customisation_layer]
    )


def add_bytes(tar, path, content, mtime):
    """
    Adds a file to the tarball with given path and contents.

    tar: 'tarfile.TarFile' object.
    path: Path of the file as a string.
    content: Contents of the file.
    mtime: 'mtime' of the file. Should be an integer representing a POSIX time.
    """
    assert type(content) is bytes

    ti = tarfile.TarInfo(path)
    ti.size = len(content)
    ti.mtime = mtime
    tar.addfile(ti, io.BytesIO(content))

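# The stream main() writes to stdout is itself a tarball, laid out as
# follows (derived from the code below; checksums are placeholders):
#
#   <hex>/layer.tar    one entry per base layer, per store layer, and for
#                      the customisation layer
#   <hex>.json         the image configuration JSON
#   manifest.json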

def main():
    with open(sys.argv[1], "r") as f:
        conf = json.load(f)

    created = (
        datetime.now(tz=timezone.utc)
        if conf["created"] == "now"
        else datetime.fromisoformat(conf["created"])
    )
    mtime = int(created.timestamp())
    store_dir = conf["store_dir"]

    from_image = load_from_image(conf["from_image"])

    with tarfile.open(mode="w|", fileobj=sys.stdout.buffer) as tar:
        layers = []
        layers.extend(add_base_layers(tar, from_image))

        start = len(layers) + 1
        for num, store_layer in enumerate(conf["store_layers"], start=start):
            print("Creating layer", num, "from paths:", store_layer,
                  file=sys.stderr)
            info = add_layer_dir(tar, store_layer, store_dir, mtime=mtime)
            layers.append(info)

        print("Creating layer", len(layers) + 1, "with customisation...",
              file=sys.stderr)
        layers.append(
            add_customisation_layer(
                tar,
                conf["customisation_layer"],
                mtime=mtime
            )
        )

        print("Adding manifests...", file=sys.stderr)

        image_json = {
            "created": datetime.isoformat(created),
            "architecture": conf["architecture"],
            "os": "linux",
            "config": overlay_base_config(from_image, conf["config"]),
            "rootfs": {
                "diff_ids": [f"sha256:{layer.checksum}" for layer in layers],
                "type": "layers",
            },
            "history": [
                {
                    "created": datetime.isoformat(created),
                    "comment": f"store paths: {layer.paths}"
                }
                for layer in layers
            ],
        }

        image_json = json.dumps(image_json, indent=4).encode("utf-8")
        image_json_checksum = hashlib.sha256(image_json).hexdigest()
        image_json_path = f"{image_json_checksum}.json"
        add_bytes(tar, image_json_path, image_json, mtime=mtime)

        manifest_json = [
            {
                "Config": image_json_path,
                "RepoTags": [conf["repo_tag"]],
                "Layers": [layer.path for layer in layers],
            }
        ]
        manifest_json = json.dumps(manifest_json, indent=4).encode("utf-8")
        add_bytes(tar, "manifest.json", manifest_json, mtime=mtime)

        print("Done.", file=sys.stderr)


if __name__ == "__main__":
    main()