nixos/slurm: add slurmdbd to module
* New options "services.slurm.dbdserver.[enable,config]"
* Add slurmdbd to test slurm.nix
parent 111d4eb090
commit 79c9dbfb40
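For orientation, a minimal sketch of how the options added in this commit might be used on the machine that runs the accounting daemon; the hostname and database name below are illustrative, and MySQL itself still has to be provisioned separately (the updated test below does exactly that):

    # Hypothetical NixOS configuration fragment for the accounting host.
    {
      services.slurm.dbdserver = {
        enable = true;
        # defaults to config.networking.hostName; set explicitly for illustration
        dbdHost = "dbd";
        # appended verbatim to the generated slurmdbd.conf
        extraConfig = ''
          StorageLoc=slurm_acct_db
        '';
      };
    }

The cluster controller then points accounting at this host through services.slurm.extraConfig (AccountingStorageHost/AccountingStorageType), as the test change below shows.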
nixos/modules/services/computing/slurm/slurm.nix

@@ -29,12 +29,19 @@ let
       ${cfg.extraPlugstackConfig}
     '';
 
   cgroupConfig = pkgs.writeTextDir "cgroup.conf"
    ''
      ${cfg.extraCgroupConfig}
    '';
 
+  slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf"
+   ''
+     DbdHost=${cfg.dbdserver.dbdHost}
+     SlurmUser=${cfg.user}
+     StorageType=accounting_storage/mysql
+     ${cfg.dbdserver.extraConfig}
+   '';
+
   # slurm expects some additional config files to be
   # in the same directory as slurm.conf
   etcSlurm = pkgs.symlinkJoin {
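For reference, with dbdHost left at its default the generator above renders a slurmdbd.conf roughly like the following; the host and user values are placeholders for networking.hostName and the configured slurm user, and anything from dbdserver.extraConfig is appended at the end:

    DbdHost=dbd
    SlurmUser=slurm
    StorageType=accounting_storage/mysql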
@@ -65,6 +72,27 @@ in
         };
       };
 
+      dbdserver = {
+        enable = mkEnableOption "SlurmDBD service";
+
+        dbdHost = mkOption {
+          type = types.str;
+          default = config.networking.hostName;
+          description = ''
+            Hostname of the machine where <literal>slurmdbd</literal>
+            is running (i.e. name returned by <literal>hostname -s</literal>).
+          '';
+        };
+
+        extraConfig = mkOption {
+          type = types.lines;
+          default = "";
+          description = ''
+            Extra configuration for <literal>slurmdbd.conf</literal>
+          '';
+        };
+      };
+
       client = {
         enable = mkEnableOption "slurm client daemon";
       };
@@ -208,6 +236,8 @@ in
           used when <literal>procTrackType=proctrack/cgroup</literal>.
         '';
       };
+
+
     };
 
   };
@@ -244,7 +274,10 @@ in
       '';
     };
 
-  in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) {
+  in mkIf ( cfg.enableStools ||
+            cfg.client.enable ||
+            cfg.server.enable ||
+            cfg.dbdserver.enable ) {
 
     environment.systemPackages = [ wrappedSlurm ];
 
@@ -301,6 +334,24 @@ in
       '';
     };
 
+    systemd.services.slurmdbd = mkIf (cfg.dbdserver.enable) {
+      path = with pkgs; [ wrappedSlurm munge coreutils ];
+
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" "munged.service" "mysql.service" ];
+      requires = [ "munged.service" "mysql.service" ];
+
+      # slurm strips the last component off the path
+      environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf";
+
+      serviceConfig = {
+        Type = "forking";
+        ExecStart = "${cfg.package}/bin/slurmdbd";
+        PIDFile = "/run/slurmdbd.pid";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
+    };
+
   };
 
 }
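A note on the SLURM_CONF line above: writeTextDir produces a directory containing slurmdbd.conf, and (per the comment in the unit) slurmdbd strips the last path component, so pointing SLURM_CONF at a dummy slurm.conf inside that directory makes the daemon pick up the generated file. A small, hypothetical evaluation sketch:

    # Evaluates to "<store path of the generated directory>/slurm.conf";
    # slurmdbd drops the file name and reads slurmdbd.conf from that directory.
    let
      pkgs = import <nixpkgs> { };
      slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf" ''
        DbdHost=dbd
      '';
    in "${slurmdbdConf}/slurm.conf"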
nixos/tests/slurm.nix

@@ -1,5 +1,7 @@
 import ./make-test.nix ({ ... }:
-let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+let
+  mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
   slurmconfig = {
     controlMachine = "control";
     nodeName = ''
@@ -7,6 +9,10 @@ let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
       NodeName=node[1-3] CPUs=1 State=UNKNOWN
     '';
     partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
+    extraConfig = ''
+      AccountingStorageHost=dbd
+      AccountingStorageType=accounting_storage/slurmdbd
+    '';
   };
 in {
   name = "slurm";
@@ -16,7 +22,7 @@ in {
     computeNode =
       { ...}:
       {
-        # TODO slrumd port and slurmctld port should be configurations and
+        # TODO slurmd port and slurmctld port should be configurations and
         # automatically allowed by the firewall.
         networking.firewall.enable = false;
         services.slurm = {
@@ -43,6 +49,24 @@ in {
         } // slurmconfig;
       };
 
+    dbd =
+      { pkgs, ... } :
+      {
+        networking.firewall.enable = false;
+        services.slurm.dbdserver = {
+          enable = true;
+        };
+        services.mysql = {
+          enable = true;
+          package = pkgs.mysql;
+          ensureDatabases = [ "slurm_acct_db" ];
+          ensureUsers = [{
+            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+            name = "slurm";
+          }];
+        };
+      };
+
     node1 = computeNode;
     node2 = computeNode;
     node3 = computeNode;
@@ -54,7 +78,7 @@ in {
   startAll;
 
   # Set up authentification across the cluster
-  foreach my $node (($submit,$control,$node1,$node2,$node3))
+  foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
   {
     $node->waitForUnit("default.target");
 
@@ -63,10 +87,22 @@ in {
     $node->succeed("chmod 0400 /etc/munge/munge.key");
     $node->succeed("chown munge:munge /etc/munge/munge.key");
     $node->succeed("systemctl restart munged");
   }
 
   $node->waitForUnit("munged");
   };
 
   # Restart the services since they have probably failed due to the munge init
   # failure
+  subtest "can_start_slurmdbd", sub {
+    $dbd->succeed("systemctl restart slurmdbd");
+    $dbd->waitForUnit("slurmdbd.service");
+  };
+
+  # there needs to be an entry for the current
+  # cluster in the database before slurmctld is restarted
+  subtest "add_account", sub {
+    $control->succeed("sacctmgr -i add cluster default");
+  };
+
   subtest "can_start_slurmctld", sub {
     $control->succeed("systemctl restart slurmctld");
@@ -81,12 +117,17 @@ in {
     }
   };
 
-  # Test that the cluster work and can distribute jobs;
+  # Test that the cluster works and can distribute jobs;
 
   subtest "run_distributed_command", sub {
     # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
     # The output must contain the 3 different names
    $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
   };
+
+  subtest "check_slurm_dbd", sub {
+    # find the srun job from above in the database
+    $submit->succeed("sacct | grep hostname");
+  };
   '';
 })