Containers: Use systemd-nspawn's --network-veth flag

Note that this causes the name of the host-side interface to change
from c-<name> to ve-<name>.
Eelco Dolstra 2014-05-07 17:00:46 +02:00
parent 810680bcae
commit 6f7aaf10a5
5 changed files with 73 additions and 105 deletions


@@ -213,8 +213,8 @@ $ ping -c1 10.233.4.2
<para>Networking is implemented using a pair of virtual Ethernet
devices. The network interface in the container is called
<literal>eth0</literal>, while the matching interface in the host is
called <literal>c-<replaceable>container-name</replaceable></literal>
(e.g., <literal>c-foo</literal>). The container has its own network
called <literal>ve-<replaceable>container-name</replaceable></literal>
(e.g., <literal>ve-foo</literal>). The container has its own network
namespace and the <literal>CAP_NET_ADMIN</literal> capability, so it
can perform arbitrary network configuration such as setting up
firewall rules, without affecting or having access to the host's
@@ -228,11 +228,11 @@ on the host:
<programlisting>
networking.nat.enable = true;
networking.nat.internalInterfaces = ["c-+"];
networking.nat.internalInterfaces = ["ve-+"];
networking.nat.externalInterface = "eth0";
</programlisting>
where <literal>eth0</literal> should be replaced with the desired
external interface. Note that <literal>c-+</literal> is a wildcard
external interface. Note that <literal>ve-+</literal> is a wildcard
that matches all container interfaces.</para>
</section>
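
For comparison, the pair that --network-veth creates is roughly what the container module used to assemble by hand (see the preStart code removed further down). A minimal sketch of that manual equivalent, assuming a hypothetical container name "foo" and a throwaway named network namespace:

    # Sketch only: approximately what systemd-nspawn's --network-veth automates.
    ip netns add demo-foo
    ip link add ve-foo type veth peer name ctmp-foo
    ip link set ctmp-foo netns demo-foo
    ip netns exec demo-foo ip link set ctmp-foo name eth0
    ip netns exec demo-foo ip link set dev eth0 up
    ip link set dev ve-foo up

systemd-nspawn performs the same steps against the container's own network namespace, which is why the named namespace and the run-in-netns helper deleted below are no longer needed.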


@@ -4,6 +4,28 @@
<title>Release notes</title>
<!--==================================================================-->
<section xml:id="sec-release-14.10">
<title>Release 14.10 (“Caterpillar”, 2014/10/??)</title>
<para>When upgrading from a previous release, please be aware of the
following incompatible changes:
<itemizedlist>
<listitem><para>The host side of a container virtual Ethernet pair
is now called <literal>ve-<replaceable>container-name</replaceable></literal>
rather than <literal>c-<replaceable>container-name</replaceable></literal>.</para></listitem>
</itemizedlist>
</para>
</section>
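
After upgrading, a quick way to confirm the rename took effect on a running system (using a hypothetical container name "foo"):

    # The host side of the container's veth pair is now ve-foo rather than c-foo.
    ip link show ve-foo
    # NAT or firewall rules written against the old name must be adjusted, e.g.
    # networking.nat.internalInterfaces = [ "c-+" ]  ->  [ "ve-+" ]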
<!--==================================================================-->
<section xml:id="sec-release-14.04">


@@ -34,9 +34,8 @@ let
# Ignore peth* devices; on Xen, they're renamed physical
# Ethernet cards used for bridging. Likewise for vif* and tap*
# (Xen) and virbr* and vnet* (libvirt) and c-* and ctmp-* (NixOS
# containers).
denyinterfaces ${toString ignoredInterfaces} lo peth* vif* tap* tun* virbr* vnet* vboxnet* c-* ctmp-*
# (Xen) and virbr* and vnet* (libvirt).
denyinterfaces ${toString ignoredInterfaces} lo peth* vif* tap* tun* virbr* vnet* vboxnet*
${config.networking.dhcpcd.extraConfig}
'';


@@ -4,16 +4,6 @@ with lib;
let
runInNetns = pkgs.stdenv.mkDerivation {
name = "run-in-netns";
unpackPhase = "true";
buildPhase = ''
mkdir -p $out/bin
gcc ${./run-in-netns.c} -o $out/bin/run-in-netns
'';
installPhase = "true";
};
nixos-container = pkgs.substituteAll {
name = "nixos-container";
dir = "bin";
@@ -23,6 +13,28 @@ let
inherit (pkgs) socat;
};
# The container's init script, a small wrapper around the regular
# NixOS stage-2 init script.
containerInit = pkgs.writeScript "container-init"
''
#! ${pkgs.stdenv.shell} -e
# Initialise the container side of the veth pair.
if [ "$PRIVATE_NETWORK" = 1 ]; then
ip link set host0 name eth0
ip link set dev eth0 up
if [ -n "$HOST_ADDRESS" ]; then
ip route add $HOST_ADDRESS dev eth0
ip route add default via $HOST_ADDRESS
fi
if [ -n "$LOCAL_ADDRESS" ]; then
ip addr add $LOCAL_ADDRESS dev eth0
fi
fi
exec "$1"
'';
system = config.nixpkgs.system;
in
@@ -70,7 +82,7 @@ in
Whether to give the container its own private virtual
Ethernet interface. The interface is called
<literal>eth0</literal>, and is hooked up to the interface
<literal>c-<replaceable>container-name</replaceable></literal>
<literal>ve-<replaceable>container-name</replaceable></literal>
on the host. If this option is not set, then the
container shares the network interfaces of the host,
and can bind to any port on any interface.
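
To make the effect concrete: with privateNetwork enabled for a container named "foo" (a hypothetical name), starting its unit creates the host-side interface, which can be inspected as follows; inside the container, only lo and eth0 are visible.

    systemctl start container@foo    # the container's template unit
    ip link show ve-foo              # host side of the veth pair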
@@ -176,39 +188,8 @@ in
"/nix/var/nix/profiles/per-container/$INSTANCE" \
"/nix/var/nix/gcroots/per-container/$INSTANCE"
if [ -f "/etc/containers/$INSTANCE.conf" ]; then
. "/etc/containers/$INSTANCE.conf"
fi
# Cleanup from last time.
ifaceHost=c-$INSTANCE
ifaceCont=ctmp-$INSTANCE
ns=net-$INSTANCE
ip netns del $ns 2> /dev/null || true
ip link del $ifaceHost 2> /dev/null || true
ip link del $ifaceCont 2> /dev/null || true
if [ "$PRIVATE_NETWORK" = 1 ]; then
# Create a pair of virtual ethernet devices. On the host,
# we get c-<container-name>, and on the guest, we get
# eth0.
ip link add $ifaceHost type veth peer name $ifaceCont
ip netns add $ns
ip link set $ifaceCont netns $ns
ip netns exec $ns ip link set $ifaceCont name eth0
ip netns exec $ns ip link set dev eth0 up
ip link set dev $ifaceHost up
if [ -n "$HOST_ADDRESS" ]; then
ip addr add $HOST_ADDRESS dev $ifaceHost
ip netns exec $ns ip route add $HOST_ADDRESS dev eth0
ip netns exec $ns ip route add default via $HOST_ADDRESS
fi
if [ -n "$LOCAL_ADDRESS" ]; then
ip netns exec $ns ip addr add $LOCAL_ADDRESS dev eth0
ip route add $LOCAL_ADDRESS dev $ifaceHost
fi
runInNetNs="${runInNetns}/bin/run-in-netns $ns"
extraFlags="--capability=CAP_NET_ADMIN"
extraFlags="--network-veth"
fi
# If the host is 64-bit and the container is 32-bit, add a
@@ -219,7 +200,7 @@ in
fi
''}
exec $runInNetNs ${config.systemd.package}/bin/systemd-nspawn \
exec ${config.systemd.package}/bin/systemd-nspawn \
--keep-unit \
-M "$INSTANCE" -D "$root" $extraFlags \
--bind-ro=/nix/store \
@@ -227,7 +208,11 @@ in
--bind-ro=/nix/var/nix/daemon-socket \
--bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles" \
--bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots" \
"''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
--setenv PRIVATE_NETWORK="$PRIVATE_NETWORK" \
--setenv HOST_ADDRESS="$HOST_ADDRESS" \
--setenv LOCAL_ADDRESS="$LOCAL_ADDRESS" \
--setenv PATH="$PATH" \
${containerInit} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
'';
postStart =
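
Put together, the ExecStart script above now reduces to a single systemd-nspawn invocation. Roughly, for a hypothetical container "foo" with a private network (the store path, container root, and addresses are illustrative placeholders, and only some of the bind mounts are shown):

    exec systemd-nspawn \
      --keep-unit -M foo -D /var/lib/containers/foo \
      --network-veth \
      --bind-ro=/nix/store \
      --bind=/nix/var/nix/profiles/per-container/foo:/nix/var/nix/profiles \
      --setenv PRIVATE_NETWORK=1 \
      --setenv HOST_ADDRESS=10.233.1.1 \
      --setenv LOCAL_ADDRESS=10.233.1.2 \
      /nix/store/...-container-init /nix/var/nix/profiles/system/init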
@@ -237,6 +222,17 @@ in
# until the start timeout expires if systemd-nspawn exits.
read x < $root/var/lib/startup-done
rm -f $root/var/lib/startup-done
if [ "$PRIVATE_NETWORK" = 1 ]; then
ifaceHost=ve-$INSTANCE
ip link set dev $ifaceHost up
if [ -n "$HOST_ADDRESS" ]; then
ip addr add $HOST_ADDRESS dev $ifaceHost
fi
if [ -n "$LOCAL_ADDRESS" ]; then
ip route add $LOCAL_ADDRESS dev $ifaceHost
fi
fi
'';
preStop =
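
Once postStart has run, the host should be able to reach the container directly over the pair. A quick sanity check, again with the hypothetical name and placeholder addresses used above:

    ip addr show dev ve-foo    # carries the host address (10.233.1.1 in the sketch)
    ip route get 10.233.1.2    # should go out via ve-foo
    ping -c1 10.233.1.2        # the container's eth0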
@@ -251,14 +247,13 @@ in
''
#! ${pkgs.stdenv.shell} -e
SYSTEM_PATH=/nix/var/nix/profiles/system
if [ -f "/etc/containers/$INSTANCE.conf" ]; then
. "/etc/containers/$INSTANCE.conf"
fi
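# Ask the running container, via its run-command socket, to activate the new configuration.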
echo $SYSTEM_PATH/bin/switch-to-configuration test | \
${pkgs.socat}/bin/socat unix:$root/var/lib/run-command.socket -
'';
serviceConfig.SyslogIdentifier = "container %i";
serviceConfig.EnvironmentFile = "-/etc/containers/%i.conf";
};
# Generate a configuration file in /etc/containers for each
@@ -288,6 +283,8 @@ in
${cfg.localAddress} ${name}.containers
'') config.containers);
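# Keep dhcpcd on the host away from the host side of container veth pairs.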
networking.dhcpcd.denyInterfaces = [ "ve-*" ];
environment.systemPackages = [ nixos-container ];
};


@@ -1,50 +0,0 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <fcntl.h>
#include <linux/limits.h>
int main(int argc, char * * argv)
{
if (argc < 3) {
fprintf(stderr, "%s: missing arguments\n", argv[0]);
return 1;
}
char nsPath[PATH_MAX];
sprintf(nsPath, "/run/netns/%s", argv[1]);
int fd = open(nsPath, O_RDONLY);
if (fd == -1) {
fprintf(stderr, "%s: opening network namespace: %s\n", argv[0], strerror(errno));
return 1;
}
if (setns(fd, CLONE_NEWNET) == -1) {
fprintf(stderr, "%s: setting network namespace: %s\n", argv[0], strerror(errno));
return 1;
}
umount2(nsPath, MNT_DETACH);
if (unlink(nsPath) == -1) {
fprintf(stderr, "%s: unlinking network namespace: %s\n", argv[0], strerror(errno));
return 1;
}
/* FIXME: Remount /sys so that /sys/class/net reflects the
interfaces visible in the network namespace. This requires
bind-mounting /sys/fs/cgroups etc. */
execv(argv[2], argv + 2);
fprintf(stderr, "%s: running command: %s\n", argv[0], strerror(errno));
return 1;
}