2018-10-25 20:03:23 +01:00
|
|
|
|
import ./make-test.nix ({ lib, ... }:
|
2018-09-15 12:09:36 +01:00
|
|
|
|
let
|
|
|
|
|
# Shared MUNGE key; testScript writes it to /etc/munge/munge.key on every machine.
mungekey = "mungeverryweakkeybuteasytointegratoinatest";
|
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
# Slurm settings common to every machine in this test; each machine merges
# this set into its own services.slurm definition.
slurmconfig = {
  # Name of the machine acting as controller (see the "control" node).
  controlMachine = "control";

  # Send accounting data to the slurmdbd instance on the "dbd" machine.
  extraConfig = ''
    AccountingStorageHost=dbd
    AccountingStorageType=accounting_storage/slurmdbd
  '';

  # One default partition, "debug", spanning node1..node3.
  partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
  nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
};
|
|
|
|
|
in {
|
|
|
|
|
# Test name handed to ./make-test.nix.
name = "slurm";
|
|
|
|
|
|
2018-10-25 20:03:23 +01:00
|
|
|
|
# Maintainer entry taken from lib.maintainers.
meta.maintainers = [ lib.maintainers.markuskowa ];
|
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
nodes =
|
|
|
|
|
let
|
|
|
|
|
# Module shared by the compute nodes: the shared Slurm settings plus the
# client role.
computeNode = { ... }: {
  # TODO: the slurmd and slurmctld ports should be configurable and allowed
  # through the firewall automatically; until then the firewall is disabled.
  networking.firewall.enable = false;

  services.slurm = { client.enable = true; } // slurmconfig;
};
|
|
|
|
|
in {
|
2018-06-01 22:42:21 +01:00
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
# Controller machine: the shared Slurm settings plus the server role.
control = { ... }: {
  networking.firewall.enable = false;

  services.slurm = { server.enable = true; } // slurmconfig;
};
|
2018-06-01 22:42:21 +01:00
|
|
|
|
|
|
|
|
|
# Submit host: the shared Slurm settings with only enableStools switched on;
# neither the client nor the server role is set here.
submit = { ... }: {
  networking.firewall.enable = false;

  services.slurm = { enableStools = true; } // slurmconfig;
};
|
|
|
|
|
|
2018-09-15 12:09:36 +01:00
|
|
|
|
# Accounting machine: the Slurm dbdserver plus a MySQL server that holds the
# slurm_acct_db database.
dbd = { pkgs, ... }: {
  networking.firewall.enable = false;

  services.slurm.dbdserver.enable = true;

  services.mysql = {
    enable = true;
    package = pkgs.mysql;

    # Create the accounting database and a "slurm" user with full rights on it.
    ensureDatabases = [ "slurm_acct_db" ];
    ensureUsers = [
      {
        name = "slurm";
        ensurePermissions."slurm_acct_db.*" = "ALL PRIVILEGES";
      }
    ];
  };
};
|
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
# node1..node3 (the node[1-3] range used in slurmconfig) share one module.
node1 = computeNode;
node2 = computeNode;
node3 = computeNode;
|
|
|
|
|
};
|
|
|
|
|
|
2018-06-01 22:42:21 +01:00
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
testScript =
|
|
|
|
|
''
|
|
|
|
|
startAll;

# Set up authentication across the cluster: every machine receives the same
# MUNGE key, owned by munge:munge with mode 0400. The key text is spliced in
# by Nix string interpolation before Perl ever sees this script.
foreach my $node ($submit, $control, $dbd, $node1, $node2, $node3) {
  $node->waitForUnit("default.target");

  # -p keeps this step from failing when /etc/munge already exists.
  $node->succeed("mkdir -p /etc/munge");
  $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
  $node->succeed("chmod 0400 /etc/munge/munge.key");
  $node->succeed("chown munge:munge /etc/munge/munge.key");

  # Restart munged now that the key file is in place, then wait for the unit.
  $node->succeed("systemctl restart munged");
  $node->waitForUnit("munged");
}
|
2015-12-25 14:55:07 +00:00
|
|
|
|
|
|
|
|
|
# The Slurm services have probably failed already because munge could not
# initialise, so each of them is restarted explicitly from here on.
subtest "can_start_slurmdbd" => sub {
  $dbd->succeed("systemctl restart slurmdbd");
  $dbd->waitForUnit("slurmdbd.service");
};
|
|
|
|
|
|
|
|
|
|
# The current cluster needs an entry in the accounting database before
# slurmctld is restarted, so register it first.
subtest "add_account" => sub {
  $control->succeed("sacctmgr -i add cluster default");
};
|
2015-12-25 14:55:07 +00:00
|
|
|
|
|
|
|
|
|
# Restart the controller daemon and wait for its unit.
subtest "can_start_slurmctld" => sub {
  $control->succeed("systemctl restart slurmctld");
  $control->waitForUnit("slurmctld.service");
};
|
|
|
|
|
|
|
|
|
|
# Restart slurmd on each compute node in turn and wait for its unit.
subtest "can_start_slurmd" => sub {
  for my $compute ($node1, $node2, $node3) {
    $compute->succeed("systemctl restart slurmd.service");
    $compute->waitForUnit("slurmd");
  }
};
|
|
|
|
|
|
2018-09-15 12:09:36 +01:00
|
|
|
|
# Check that the cluster works and can distribute a job over its nodes.
subtest "run_distributed_command" => sub {
  # `srun -N 3 hostname` runs on three nodes of the partition, i.e. on all of
  # them, so the de-duplicated output must hold exactly three host names.
  $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
};
|
2018-09-15 12:09:36 +01:00
|
|
|
|
|
|
|
|
|
subtest "check_slurm_dbd" => sub {
  # Find the srun job from above in the accounting database. The lookup is
  # retried instead of run once: the record presumably reaches slurmdbd a
  # moment after srun returns, and a single immediate check could fail
  # spuriously.
  $submit->waitUntilSucceeds("sacct | grep hostname");
};
|
2015-12-25 14:55:07 +00:00
|
|
|
|
'';
|
|
|
|
|
})
|