diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index 9dbb6a4d0d93..77b9c40577a3 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -29,12 +29,19 @@ let
${cfg.extraPlugstackConfig}
'';
-
cgroupConfig = pkgs.writeTextDir "cgroup.conf"
''
${cfg.extraCgroupConfig}
'';
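+  # a sketch of the rationale: slurmdbd.conf is written to its own store
+  # directory so that the slurmdbd service below can point SLURM_CONF into it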
+ slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf"
+ ''
+ DbdHost=${cfg.dbdserver.dbdHost}
+ SlurmUser=${cfg.user}
+ StorageType=accounting_storage/mysql
+ ${cfg.dbdserver.extraConfig}
+ '';
+
# slurm expects some additional config files to be
# in the same directory as slurm.conf
etcSlurm = pkgs.symlinkJoin {
@@ -65,6 +72,27 @@ in
};
};
+ dbdserver = {
+ enable = mkEnableOption "SlurmDBD service";
+
+ dbdHost = mkOption {
+ type = types.str;
+ default = config.networking.hostName;
+ description = ''
+            Hostname of the machine where slurmdbd is running
+            (i.e. the name returned by hostname -s).
+ '';
+ };
+
+ extraConfig = mkOption {
+ type = types.lines;
+ default = "";
+ description = ''
+            Extra configuration appended to slurmdbd.conf.
+ '';
+ };
+ };
+
client = {
enable = mkEnableOption "slurm client daemon";
};
@@ -244,7 +274,10 @@ in
'';
};
- in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) {
+  in mkIf (cfg.enableStools ||
+           cfg.client.enable ||
+           cfg.server.enable ||
+           cfg.dbdserver.enable) {
environment.systemPackages = [ wrappedSlurm ];
@@ -301,6 +334,24 @@ in
'';
};
+    systemd.services.slurmdbd = mkIf cfg.dbdserver.enable {
+ path = with pkgs; [ wrappedSlurm munge coreutils ];
+
+ wantedBy = [ "multi-user.target" ];
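+      # slurmdbd needs munge for authentication and mysql as its
+      # storage backend, so start it after both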
+ after = [ "network.target" "munged.service" "mysql.service" ];
+ requires = [ "munged.service" "mysql.service" ];
+
+      # slurmdbd strips the last path component off SLURM_CONF and looks
+      # for slurmdbd.conf in the resulting directory
+ environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf";
+
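+      # slurmdbd forks into the background by default, hence Type=forking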
+ serviceConfig = {
+ Type = "forking";
+ ExecStart = "${cfg.package}/bin/slurmdbd";
+ PIDFile = "/run/slurmdbd.pid";
+ ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+ };
+ };
+
};
}
diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index 60f44c3c8459..54ea1ee7894e 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -1,5 +1,7 @@
import ./make-test.nix ({ ... }:
-let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+let
+ mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
slurmconfig = {
controlMachine = "control";
nodeName = ''
@@ -7,6 +9,10 @@ let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
NodeName=node[1-3] CPUs=1 State=UNKNOWN
'';
partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
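+    # send accounting records to the slurmdbd instance on the "dbd" node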
+ extraConfig = ''
+ AccountingStorageHost=dbd
+ AccountingStorageType=accounting_storage/slurmdbd
+ '';
};
in {
name = "slurm";
@@ -16,7 +22,7 @@ in {
computeNode =
{ ...}:
{
- # TODO slrumd port and slurmctld port should be configurations and
+ # TODO slurmd port and slurmctld port should be configurations and
# automatically allowed by the firewall.
networking.firewall.enable = false;
services.slurm = {
@@ -43,6 +49,24 @@ in {
} // slurmconfig;
};
+ dbd =
+    { pkgs, ... }:
+ {
+ networking.firewall.enable = false;
+ services.slurm.dbdserver = {
+ enable = true;
+ };
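+      # accounting database for slurmdbd; ensureUsers lets the slurm
+      # system user connect over the local unix socket without a password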
+ services.mysql = {
+ enable = true;
+ package = pkgs.mysql;
+ ensureDatabases = [ "slurm_acct_db" ];
+ ensureUsers = [{
+ ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+ name = "slurm";
+ }];
+ };
+ };
+
node1 = computeNode;
node2 = computeNode;
node3 = computeNode;
@@ -54,7 +78,7 @@ in {
startAll;
-    # Set up authentification across the cluster
+    # Set up authentication across the cluster
- foreach my $node (($submit,$control,$node1,$node2,$node3))
+ foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
{
$node->waitForUnit("default.target");
@@ -63,10 +87,22 @@ in {
$node->succeed("chmod 0400 /etc/munge/munge.key");
$node->succeed("chown munge:munge /etc/munge/munge.key");
$node->succeed("systemctl restart munged");
- }
+
+ $node->waitForUnit("munged");
+ };
# Restart the services since they have probably failed due to the munge init
# failure
+ subtest "can_start_slurmdbd", sub {
+ $dbd->succeed("systemctl restart slurmdbd");
+ $dbd->waitForUnit("slurmdbd.service");
+ };
+
+ # there needs to be an entry for the current
+ # cluster in the database before slurmctld is restarted
+ subtest "add_account", sub {
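+      # -i commits the change immediately, without asking for confirmation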
+ $control->succeed("sacctmgr -i add cluster default");
+ };
subtest "can_start_slurmctld", sub {
$control->succeed("systemctl restart slurmctld");
@@ -81,12 +117,17 @@ in {
}
};
- # Test that the cluster work and can distribute jobs;
+ # Test that the cluster works and can distribute jobs;
subtest "run_distributed_command", sub {
-    # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
-    # The output must contain the 3 different names
+    # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes);
+    # the output must contain the 3 different host names.
$submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
};
+
+ subtest "check_slurm_dbd", sub {
+ # find the srun job from above in the database
+ $submit->succeed("sacct | grep hostname");
+ };
'';
})