diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index 9dbb6a4d0d93..77b9c40577a3 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -29,12 +29,19 @@ let
     ${cfg.extraPlugstackConfig}
   '';
 
-
   cgroupConfig = pkgs.writeTextDir "cgroup.conf"
    ''
      ${cfg.extraCgroupConfig}
    '';
 
+  slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf"
+   ''
+     DbdHost=${cfg.dbdserver.dbdHost}
+     SlurmUser=${cfg.user}
+     StorageType=accounting_storage/mysql
+     ${cfg.dbdserver.extraConfig}
+   '';
+
   # slurm expects some additional config files to be
   # in the same directory as slurm.conf
   etcSlurm = pkgs.symlinkJoin {
@@ -65,6 +72,27 @@ in
       };
     };
 
+    dbdserver = {
+      enable = mkEnableOption "SlurmDBD service";
+
+      dbdHost = mkOption {
+        type = types.str;
+        default = config.networking.hostName;
+        description = ''
+          Hostname of the machine where slurmdbd
+          is running (i.e. name returned by hostname -s).
+        '';
+      };
+
+      extraConfig = mkOption {
+        type = types.lines;
+        default = "";
+        description = ''
+          Extra configuration for slurmdbd.conf
+        '';
+      };
+    };
+
     client = {
       enable = mkEnableOption "slurm client daemon";
     };
@@ -208,6 +236,8 @@ in
         used when procTrackType=proctrack/cgroup.
       '';
     };
+
+
   };
 
 };
@@ -244,7 +274,10 @@ in
     '';
   };
 
-  in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) {
+  in mkIf ( cfg.enableStools ||
+            cfg.client.enable ||
+            cfg.server.enable ||
+            cfg.dbdserver.enable ) {
 
   environment.systemPackages = [ wrappedSlurm ];
 
@@ -301,6 +334,24 @@ in
     '';
   };
 
+  systemd.services.slurmdbd = mkIf (cfg.dbdserver.enable) {
+    path = with pkgs; [ wrappedSlurm munge coreutils ];
+
+    wantedBy = [ "multi-user.target" ];
+    after = [ "network.target" "munged.service" "mysql.service" ];
+    requires = [ "munged.service" "mysql.service" ];
+
+    # slurm strips the last component off the path
+    environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf";
+
+    serviceConfig = {
+      Type = "forking";
+      ExecStart = "${cfg.package}/bin/slurmdbd";
+      PIDFile = "/run/slurmdbd.pid";
+      ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+    };
+  };
+
 };
 
 }
diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index 60f44c3c8459..54ea1ee7894e 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -1,5 +1,7 @@
 import ./make-test.nix ({ ... }:
-let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+let
+    mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
     slurmconfig = {
       controlMachine = "control";
       nodeName = ''
@@ -7,6 +9,10 @@ let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
         NodeName=node[1-3] CPUs=1 State=UNKNOWN
       '';
       partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
+      extraConfig = ''
+        AccountingStorageHost=dbd
+        AccountingStorageType=accounting_storage/slurmdbd
+      '';
     };
 in {
   name = "slurm";
@@ -16,7 +22,7 @@ in {
     computeNode =
       { ...}:
       {
-        # TODO slrumd port and slurmctld port should be configurations and
+        # TODO slurmd port and slurmctld port should be configurations and
         # automatically allowed by the firewall.
         networking.firewall.enable = false;
         services.slurm = {
@@ -43,6 +49,24 @@ in {
         } // slurmconfig;
       };
 
+    dbd =
+      { pkgs, ... } :
+      {
+        networking.firewall.enable = false;
+        services.slurm.dbdserver = {
+          enable = true;
+        };
+        services.mysql = {
+          enable = true;
+          package = pkgs.mysql;
+          ensureDatabases = [ "slurm_acct_db" ];
+          ensureUsers = [{
+            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+            name = "slurm";
+          }];
+        };
+      };
+
     node1 = computeNode;
     node2 = computeNode;
     node3 = computeNode;
@@ -54,7 +78,7 @@ in {
     startAll;
 
     # Set up authentification across the cluster
-    foreach my $node (($submit,$control,$node1,$node2,$node3))
+    foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
     {
       $node->waitForUnit("default.target");
 
@@ -63,10 +87,22 @@ in {
       $node->succeed("chmod 0400 /etc/munge/munge.key");
       $node->succeed("chown munge:munge /etc/munge/munge.key");
       $node->succeed("systemctl restart munged");
-    }
+
+      $node->waitForUnit("munged");
+    };
 
     # Restart the services since they have probably failed due to the munge init
     # failure
+    subtest "can_start_slurmdbd", sub {
+      $dbd->succeed("systemctl restart slurmdbd");
+      $dbd->waitForUnit("slurmdbd.service");
+    };
+
+    # there needs to be an entry for the current
+    # cluster in the database before slurmctld is restarted
+    subtest "add_account", sub {
+      $control->succeed("sacctmgr -i add cluster default");
+    };
     subtest "can_start_slurmctld", sub {
       $control->succeed("systemctl restart slurmctld");
 
@@ -81,12 +117,17 @@ in {
       }
     };
 
-    # Test that the cluster work and can distribute jobs;
+    # Test that the cluster works and can distribute jobs;
     subtest "run_distributed_command", sub {
       # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
       # The output must contain the 3 different names
       $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
     };
+
+    subtest "check_slurm_dbd", sub {
+      # find the srun job from above in the database
+      $submit->succeed("sacct | grep hostname");
+    };
 
   '';
 })
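
For reference, a minimal NixOS configuration for the host that is to run slurmdbd might look like the sketch below. It is not part of the patch itself; it simply mirrors the "dbd" test node added above, so the database name "slurm_acct_db" and the MySQL user "slurm" are assumptions carried over from that test rather than requirements of the module.

  { pkgs, ... }:
  {
    # Starts the slurmdbd systemd service added by this patch;
    # dbdHost defaults to config.networking.hostName.
    services.slurm.dbdserver.enable = true;

    # The generated slurmdbd.conf uses StorageType=accounting_storage/mysql,
    # and the service requires mysql.service, so a local MySQL server with a
    # writable accounting database is expected.
    services.mysql = {
      enable = true;
      package = pkgs.mysql;
      ensureDatabases = [ "slurm_acct_db" ];
      ensureUsers = [{
        name = "slurm";
        ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
      }];
    };
  }

On the controller and compute nodes, accounting is pointed at this host the same way the test's slurmconfig.extraConfig does it: AccountingStorageHost=<hostname of the dbd machine> and AccountingStorageType=accounting_storage/slurmdbd.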