nixpkgs/nixos/tests/consul.nix

import ./make-test-python.nix ({pkgs, lib, ...}:

let
  # Knobs shared by every Consul node in this test (servers and client agents).
  webUi = true;
  retry_interval = "1s";
  raft_multiplier = 1;

  # Baseline for `services.consul.extraConfig`; each node extends it with `//`.
  defaultExtraConfig = {
    inherit retry_interval;
    performance.raft_multiplier = raft_multiplier;
  };

  # Static IPs of the Consul servers; `server` picks its entry by list index.
  allConsensusServerHosts = [ "192.168.1.1" "192.168.1.2" "192.168.1.3" ];

  # Static IPs of the Consul client agents; `client` picks its entry by list index.
  allConsensusClientHosts = [ "192.168.2.1" "192.168.2.2" ];

  # Firewall openings applied to every node; port roles are taken from
  # https://www.consul.io/docs/install/ports.html
  firewallSettings = {
    allowedTCPPorts = [
      8301 # Serf LAN
      8302 # Serf WAN
      8600 # DNS
      8500 # HTTP API
      8300 # server RPC
    ];
    allowedUDPPorts = [
      8301 # Serf LAN
      8302 # Serf WAN
      8600 # DNS
    ];
  };

  # NixOS module for the client agent whose IP is entry `index` of
  # `allConsensusClientHosts`.
  client = index: { pkgs, ... }:
    let
      address = builtins.elemAt allConsensusClientHosts index;
    in
      {
        environment.systemPackages = [ pkgs.consul ];

        networking = {
          firewall = firewallSettings;
          # `mkOverride 0` makes this definition take precedence over any
          # other definition of the option.
          interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
            { inherit address; prefixLength = 16; }
          ];
        };

        services.consul = {
          enable = true;
          inherit webUi;
          extraConfig = defaultExtraConfig // {
            # Agent mode: join the cluster through any of the servers.
            server = false;
            bind_addr = address;
            retry_join = allConsensusServerHosts;
          };
        };
      };

  # NixOS module for the Consul server whose IP is entry `index` of
  # `allConsensusServerHosts`.
  server = index: { pkgs, ... }:
    let
      serverCount = builtins.length allConsensusServerHosts;
      ownHost = builtins.elemAt allConsensusServerHosts index;
      # Servers are identified by their IP, so the host entry doubles as the address.
      address = ownHost;
      # Every server except this one.
      otherServers = builtins.filter (h: h != ownHost) allConsensusServerHosts;
    in
      {
        networking = {
          firewall = firewallSettings;
          interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
            { inherit address; prefixLength = 16; }
          ];
        };

        services.consul =
          assert builtins.elem ownHost allConsensusServerHosts;
          {
            enable = true;
            inherit webUi;
            extraConfig = defaultExtraConfig // {
              server = true;
              bind_addr = address;
              bootstrap_expect = serverCount;
              # Tell Consul that we never intend to drop below this many servers.
              # Ensures to not permanently lose consensus after temporary loss.
              # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
              autopilot.min_quorum = serverCount;
              retry_join =
                # A node must not try to join itself and joins only the other
                # servers, unless it is the only node in the network, in which
                # case self-join is allowed.
                # See https://github.com/hashicorp/consul/issues/2868
                if serverCount != 1
                  then otherServers
                  else allConsensusServerHosts;
            };
          };
      };
in {
  name = "consul";

  # Test VMs: three Consul servers and two client agents. The numeric argument
  # is the index into `allConsensusServerHosts` / `allConsensusClientHosts`
  # that selects the node's IP address.
  nodes = {
    server1 = server 0;
    server2 = server 1;
    server3 = server 2;

    client1 = client 0;
    client2 = client 1;
  };

  testScript = ''
    # The Consul servers, and all VMs of the cluster (servers first, then clients).
    servers = [server1, server2, server3]
    machines = servers + [client1, client2]

    # Wait for the `consul.service` unit on every machine.
    for machine in machines:
        machine.wait_for_unit("consul.service")


    def wait_for_healthy_servers():
        """
        Waits until every machine sees all servers listed as Raft voters
        in `consul operator raft list-peers`.

        The expected number of voters is derived from `servers`, so it
        stays correct if servers are added to or removed from the test.
        """
        # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
        # for why the `Voter` column of `list-peers` has that info.
        # TODO: The `grep true` relies on the fact that currently in
        #       the output like
        #           # consul operator raft list-peers
        #           Node     ID   Address           State     Voter  RaftProtocol
        #           server3  ...  192.168.1.3:8300  leader    true   3
        #           server2  ...  192.168.1.2:8300  follower  true   3
        #           server1  ...  192.168.1.1:8300  follower  false  3
        #       `Voter` is the only boolean column.
        #       Change this to the more reliable way to be defined by
        #       https://github.com/hashicorp/consul/issues/8118
        #       once that ticket is closed.
        count_voters = "consul operator raft list-peers | grep true | wc -l"
        for m in machines:
            m.wait_until_succeeds(f"[ $({count_voters}) == {len(servers)} ]")


    def wait_for_all_machines_alive():
        """
        Waits until every machine reports all machines (servers and clients)
        as `alive` in `consul members`.

        The expected number of members is derived from `machines`, so it
        stays correct if nodes are added to or removed from the test.

        Note that Serf-"alive" does not mean "Raft"-healthy;
        see `wait_for_healthy_servers()` for that instead.
        """
        for m in machines:
            m.wait_until_succeeds(
                f"[ $(consul members | grep -o alive | wc -l) == {len(machines)} ]"
            )


    # Wait for the initial cluster to form before running any commands.
    wait_for_healthy_servers()
    # Also wait for clients to be alive.
    wait_for_all_machines_alive()

    # Smoke test: a value written via client1 must be readable via client2.
    client1.succeed("consul kv put testkey 42")
    client2.succeed("[ $(consul kv get testkey) == 42 ]")


    def rolling_reboot_test(proper_rolling_procedure=True):
        """
        Tests that the cluster can tolerate failures of any single server,
        following the recommended rolling upgrade procedure from
        https://www.consul.io/docs/upgrading#standard-upgrades.

        Each server in turn is crashed and started again; the KV store is
        exercised through the clients both while the server is down and
        after it is back.

        Optionally, `proper_rolling_procedure=False` can be given
        to wait only for each server to be back `Healthy`, not `Stable`
        in the Raft consensus, see Consul setting `ServerStabilizationTime` and
        https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
        """
        clients = [client1, client2]

        # Pick once how to wait after a server has been restarted.
        if proper_rolling_procedure:
            # Wait for recovery.
            wait_for_recovery = wait_for_healthy_servers
        else:
            # NOT proper rolling upgrade procedure, see above.
            wait_for_recovery = wait_for_all_machines_alive

        for server in servers:
            server.crash()

            # Clients need a working connection again before commands are
            # issued; `kv get -recurse` succeeding is used as the signal.
            for client in clients:
                client.wait_until_succeeds("consul kv get -recurse")

            # Use the KV store while one server is down.
            client1.succeed("consul kv put testkey 43")
            client2.succeed("[ $(consul kv get testkey) == 43 ]")
            client2.succeed("consul kv delete testkey")

            # Bring the crashed machine back and wait as selected above.
            server.start()
            wait_for_recovery()

            # Wait for client connections.
            for client in clients:
                client.wait_until_succeeds("consul kv get -recurse")

            # Use the KV store again with the server back up.
            client1.succeed("consul kv put testkey 44")
            client2.succeed("[ $(consul kv get testkey) == 44 ]")
            client2.succeed("consul kv delete testkey")


    def all_servers_crash_simultaneously_test():
        """
        Tests that the cluster will eventually come back after all
        servers crash simultaneously.
        """
        # Crash every server before starting any of them again.
        for server in servers:
            server.crash()
        for server in servers:
            server.start()

        # Wait for recovery.
        wait_for_healthy_servers()

        # Wait for client connections.
        for client in [client1, client2]:
            client.wait_until_succeeds("consul kv get -recurse")

        # Use the KV store with the servers back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")


    # Run the tests.

    # 1. Reboot servers one at a time, waiting for `wait_for_healthy_servers()`
    #    after each restart.
    print("rolling_reboot_test()")
    rolling_reboot_test()

    # 2. Crash all servers at once and check that the cluster comes back.
    print("all_servers_crash_simultaneously_test()")
    all_servers_crash_simultaneously_test()

    # 3. Reboot servers one at a time again, but wait only for
    #    `wait_for_all_machines_alive()` after each restart.
    print("rolling_reboot_test(proper_rolling_procedure=False)")
    rolling_reboot_test(proper_rolling_procedure=False)
  '';
})
nixosTests.consul: init 2019-11-24 02:40:27 +00:00			`import ./make-test-python.nix ({pkgs, lib, ...}:`

			`let`
			`# Settings for both servers and agents`
			`webUi = true;`
			`retry_interval = "1s";`
			`raft_multiplier = 1;`

			`defaultExtraConfig = {`
			`inherit retry_interval;`
			`performance = {`
			`inherit raft_multiplier;`
			`};`
			`};`

			`allConsensusServerHosts = [`
			`"192.168.1.1"`
			`"192.168.1.2"`
			`"192.168.1.3"`
			`];`

			`allConsensusClientHosts = [`
			`"192.168.2.1"`
			`"192.168.2.2"`
			`];`

			`firewallSettings = {`
			`# See https://www.consul.io/docs/install/ports.html`
			`allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];`
			`allowedUDPPorts = [ 8301 8302 8600 ];`
			`};`

			`client = index: { pkgs, ... }:`
			`let`
			`ip = builtins.elemAt allConsensusClientHosts index;`
			`in`
			`{`
			`environment.systemPackages = [ pkgs.consul ];`

			`networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [`
			`{ address = ip; prefixLength = 16; }`
			`];`
			`networking.firewall = firewallSettings;`

			`services.consul = {`
			`enable = true;`
			`inherit webUi;`
			`extraConfig = defaultExtraConfig // {`
			`server = false;`
			`retry_join = allConsensusServerHosts;`
			`bind_addr = ip;`
			`};`
			`};`
			`};`

			`server = index: { pkgs, ... }:`
			`let`
consul.passthru.tests: Refactor: Extract variable 2020-06-18 00:49:24 +01:00			`numConsensusServers = builtins.length allConsensusServerHosts;`
consul.passthru.tests: Refactor let bindings 2020-06-18 00:48:19 +01:00			`thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;`
			`ip = thisConsensusServerHost; # since we already use IPs to identify servers`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00			`in`
			`{`
			`networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [`
consul.passthru.tests: Refactor let bindings 2020-06-18 00:48:19 +01:00			`{ address = ip; prefixLength = 16; }`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00			`];`
			`networking.firewall = firewallSettings;`

			`services.consul =`
			`assert builtins.elem thisConsensusServerHost allConsensusServerHosts;`
			`{`
			`enable = true;`
			`inherit webUi;`
			`extraConfig = defaultExtraConfig // {`
			`server = true;`
consul.passthru.tests: Refactor: Extract variable 2020-06-18 00:49:24 +01:00			`bootstrap_expect = numConsensusServers;`
consul.passthru.tests: Fix failure on current consul. Fixes #90613. Done by setting `autopilot.min_quorum = 3`. Techncially, this would have been required to keep the test correct since Consul's "autopilot" "Dead Server Cleanup" was enabled by default (I believe that was in Consul 0.8). Practically, the issue only occurred with our NixOS test with releases >= `1.7.0-beta2` (see #90613). The setting itself is available since Consul 1.6.2. However, this setting was not documented clearly enough for anybody to notice, and only the upstream issue https://github.com/hashicorp/consul/issues/8118 I filed brought that to light. As explained there, the test could also have been made pass by applying the more correct rolling reboot procedure -m.wait_until_succeeds("[ $(consul members \| grep -o alive \| wc -l) == 5 ]") +m.wait_until_succeeds( + "[ $(consul operator raft list-peers \| grep true \| wc -l) == 3 ]" +) but we also intend to test that Consul can regain consensus even if the quorum gets temporarily broken. 2020-06-18 01:08:17 +01:00			`# Tell Consul that we never intend to drop below this many servers.`
			`# Ensures to not permanently lose consensus after temporary loss.`
			`# See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040`
			`autopilot.min_quorum = numConsensusServers;`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00			`retry_join =`
			`# If there's only 1 node in the network, we allow self-join;`
			`# otherwise, the node must not try to join itself, and join only the other servers.`
			`# See https://github.com/hashicorp/consul/issues/2868`
consul.passthru.tests: Refactor: Extract variable 2020-06-18 00:49:24 +01:00			`if numConsensusServers == 1`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00			`then allConsensusServerHosts`
			`else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;`
			`bind_addr = ip;`
			`};`
			`};`
			`};`
			`in {`
			`name = "consul";`

			`nodes = {`
			`server1 = server 0;`
			`server2 = server 1;`
			`server3 = server 2;`

			`client1 = client 0;`
			`client2 = client 1;`
			`};`

			`testScript = ''`
			`servers = [server1, server2, server3]`
			`machines = [server1, server2, server3, client1, client2]`

			`for m in machines:`
			`m.wait_for_unit("consul.service")`

consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00
			`def wait_for_healthy_servers():`
consul.passthru.tests: Use correct server health test. From: https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040 2020-06-18 01:45:42 +01:00			`# See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040`
			# for why the `Voter` column of `list-peers` has that info.
			# TODO: The `grep true` relies on the fact that currently in
			`# the output like`
			`# # consul operator raft list-peers`
			`# Node ID Address State Voter RaftProtocol`
			`# server3 ... 192.168.1.3:8300 leader true 3`
			`# server2 ... 192.168.1.2:8300 follower true 3`
			`# server1 ... 192.168.1.1:8300 follower false 3`
			# `Voter`is the only boolean column.
			`# Change this to the more reliable way to be defined by`
			`# https://github.com/hashicorp/consul/issues/8118`
			`# once that ticket is closed.`
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`for m in machines:`
consul.passthru.tests: Use correct server health test. From: https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040 2020-06-18 01:45:42 +01:00			`m.wait_until_succeeds(`
			`"[ $(consul operator raft list-peers \| grep true \| wc -l) == 3 ]"`
			`)`
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00

consul.passthru.tests: Refactor: Extract function 2020-06-18 02:05:54 +01:00			`def wait_for_all_machines_alive():`
			`"""`
			`Note that Serf-"alive" does not mean "Raft"-healthy;`
			see `wait_for_healthy_servers()` for that instead.
			`"""`
			`for m in machines:`
			`m.wait_until_succeeds("[ $(consul members \| grep -o alive \| wc -l) == 5 ]")`


consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`wait_for_healthy_servers()`
			`# Also wait for clients to be alive.`
consul.passthru.tests: Refactor: Extract function 2020-06-18 02:05:54 +01:00			`wait_for_all_machines_alive()`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00
			`client1.succeed("consul kv put testkey 42")`
			`client2.succeed("[ $(consul kv get testkey) == 42 ]")`


consul.passthru.tests: Add 2 more tests 2020-06-18 02:06:24 +01:00			`def rolling_reboot_test(proper_rolling_procedure=True):`
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`"""`
			`Tests that the cluster can tolearate failures of any single server,`
			`following the recommended rolling upgrade procedure from`
consul.passthru.tests: Add 2 more tests 2020-06-18 02:06:24 +01:00			`https://www.consul.io/docs/upgrading#standard-upgrades.`

			Optionally, `proper_rolling_procedure=False` can be given
			to wait only for each server to be back `Healthy`, not `Stable`
			in the Raft consensus, see Consul setting `ServerStabilizationTime` and
			`https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.`
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`"""`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`for server in servers:`
			`server.crash()`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`# For each client, wait until they have connection again`
			# using `kv get -recurse` before issuing commands.
			`client1.wait_until_succeeds("consul kv get -recurse")`
			`client2.wait_until_succeeds("consul kv get -recurse")`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`# Do some consul actions while one server is down.`
			`client1.succeed("consul kv put testkey 43")`
			`client2.succeed("[ $(consul kv get testkey) == 43 ]")`
			`client2.succeed("consul kv delete testkey")`

			`# Restart crashed machine.`
			`server.start()`

consul.passthru.tests: Add 2 more tests 2020-06-18 02:06:24 +01:00			`if proper_rolling_procedure:`
			`# Wait for recovery.`
			`wait_for_healthy_servers()`
			`else:`
			`# NOT proper rolling upgrade procedure, see above.`
			`wait_for_all_machines_alive()`
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00
			`# Wait for client connections.`
			`client1.wait_until_succeeds("consul kv get -recurse")`
			`client2.wait_until_succeeds("consul kv get -recurse")`

			`# Do some consul actions with server back up.`
			`client1.succeed("consul kv put testkey 44")`
			`client2.succeed("[ $(consul kv get testkey) == 44 ]")`
			`client2.succeed("consul kv delete testkey")`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00

consul.passthru.tests: Add 2 more tests 2020-06-18 02:06:24 +01:00			`def all_servers_crash_simultaneously_test():`
			`"""`
			`Tests that the cluster will eventually come back after all`
			`servers crash simultaneously.`
			`"""`

			`for server in servers:`
			`server.crash()`

			`for server in servers:`
			`server.start()`

			`# Wait for recovery.`
			`wait_for_healthy_servers()`

			`# Wait for client connections.`
			`client1.wait_until_succeeds("consul kv get -recurse")`
			`client2.wait_until_succeeds("consul kv get -recurse")`

			`# Do some consul actions with servers back up.`
			`client1.succeed("consul kv put testkey 44")`
			`client2.succeed("[ $(consul kv get testkey) == 44 ]")`
			`client2.succeed("consul kv delete testkey")`


consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`# Run the tests.`
consul.passthru.tests: Add 2 more tests 2020-06-18 02:06:24 +01:00
			`print("rolling_reboot_test()")`
consul.passthru.tests: Refactor into functions. For better naming and commentary. 2020-06-18 01:43:11 +01:00			`rolling_reboot_test()`
consul.passthru.tests: Add 2 more tests 2020-06-18 02:06:24 +01:00
			`print("all_servers_crash_simultaneously_test()")`
			`all_servers_crash_simultaneously_test()`

			`print("rolling_reboot_test(proper_rolling_procedure=False)")`
			`rolling_reboot_test(proper_rolling_procedure=False)`
nixosTests.consul: init 2019-11-24 02:40:27 +00:00			`'';`
			`})`