2019-11-10 22:07:08 +00:00
|
|
|
|
import ./make-test-python.nix ({ lib, ... }:
|
2018-09-15 12:09:36 +01:00
|
|
|
|
let
|
|
|
|
|
# Munge key shared by all machines: the test script writes this value to
# /etc/munge/munge.key on every node before restarting munged.
# As the value itself says, it is a weak key chosen for ease of use in a test.
mungekey = "mungeverryweakkeybuteasytointegratoinatest";
|
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
# Slurm options common to every machine in this test. Each machine merges
# this set into its own `services.slurm` definition.
slurmconfig = {
  # Accounting storage is configured to use slurmdbd on the host named "dbd".
  extraConfig = ''
    AccountingStorageHost=dbd
    AccountingStorageType=accounting_storage/slurmdbd
  '';

  # A single default partition covering node1..node3.
  partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
  nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];

  # Host name of the machine that enables the Slurm server below.
  controlMachine = "control";
};
|
|
|
|
|
in {
|
|
|
|
|
# Test name and the maintainer responsible for it.
name = "slurm";

meta = {
  maintainers = with lib.maintainers; [ markuskowa ];
};
|
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
nodes =
|
|
|
|
|
let
|
|
|
|
|
# Machine configuration reused for node1, node2 and node3: the shared Slurm
# settings plus the Slurm client enabled.
computeNode =
  { ... }:
  {
    # TODO slurmd port and slurmctld port should be configurations and
    # automatically allowed by the firewall.
    networking.firewall.enable = false;

    # The two sets have no attribute in common, so the merge order is irrelevant.
    services.slurm = slurmconfig // { client.enable = true; };
  };
|
|
|
|
|
in {
|
2018-06-01 22:42:21 +01:00
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
# Controller machine: the shared Slurm settings plus the Slurm server enabled.
# Its host name matches `controlMachine` in slurmconfig.
control =
  { ... }:
  {
    networking.firewall.enable = false;

    # The two sets have no attribute in common, so the merge order is irrelevant.
    services.slurm = slurmconfig // { server.enable = true; };
  };
|
2018-06-01 22:42:21 +01:00
|
|
|
|
|
|
|
|
|
# Submit host: the shared Slurm settings plus `enableStools`. The test script
# runs `srun` from this machine.
submit =
  { ... }:
  {
    networking.firewall.enable = false;

    # The two sets have no attribute in common, so the merge order is irrelevant.
    services.slurm = slurmconfig // { enableStools = true; };
  };
|
|
|
|
|
|
2018-09-15 12:09:36 +01:00
|
|
|
|
# Accounting database machine: runs slurmdbd backed by a local MariaDB.
dbd =
  { pkgs, ... }:
  let
    # Password of the 'slurm' database user. It is used both as slurmdbd's
    # storagePass and in the SQL init script, so it is defined once here.
    dbPassword = "password123";
  in
  {
    networking.firewall.enable = false;

    services.slurm.dbdserver = {
      enable = true;
      storagePass = dbPassword;
    };

    services.mysql = {
      enable = true;
      package = pkgs.mariadb;

      # Creates the password-authenticated 'slurm' user for slurmdbd.
      initialScript = pkgs.writeText "mysql-init.sql" ''
        CREATE USER 'slurm'@'localhost' IDENTIFIED BY '${dbPassword}';
        GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
      '';

      ensureDatabases = [ "slurm_acct_db" ];
      ensureUsers = [{
        ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
        name = "slurm";
      }];

      extraOptions = ''
        # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
        innodb_buffer_pool_size=1024M
        innodb_log_file_size=64M
        innodb_lock_wait_timeout=900
      '';
    };
  };
|
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
# Three identical compute nodes; their names match the `node[1-3]` range
# used in slurmconfig.
node1 = computeNode;
node2 = computeNode;
node3 = computeNode;
|
|
|
|
|
};
|
|
|
|
|
|
2018-06-01 22:42:21 +01:00
|
|
|
|
|
2015-12-25 14:55:07 +00:00
|
|
|
|
# Python test script: distributes the munge key, restarts the Slurm daemons in
# dependency order (slurmdbd, then slurmctld, then slurmd), runs a job across
# all three compute nodes and checks that it was recorded in the accounting
# database.
testScript =
  ''
    start_all()

    # Set up authentication across the cluster
    for node in [submit, control, dbd, node1, node2, node3]:

        node.wait_for_unit("default.target")

        # -p keeps this step from failing when the directory already exists
        node.succeed("mkdir -p /etc/munge")
        node.succeed(
            "echo '${mungekey}' > /etc/munge/munge.key"
        )
        node.succeed("chmod 0400 /etc/munge/munge.key")
        node.succeed("chown munge:munge /etc/munge/munge.key")
        node.succeed("systemctl restart munged")

        node.wait_for_unit("munged")


    # Restart the services since they have probably failed due to the munge init
    # failure
    with subtest("can_start_slurmdbd"):
        dbd.succeed("systemctl restart slurmdbd")
        dbd.wait_for_unit("slurmdbd.service")
        dbd.wait_for_open_port(6819)

    # there needs to be an entry for the current
    # cluster in the database before slurmctld is restarted
    with subtest("add_account"):
        control.succeed("sacctmgr -i add cluster default")
        # check for cluster entry
        control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")

    with subtest("can_start_slurmctld"):
        control.succeed("systemctl restart slurmctld")
        control.wait_for_unit("slurmctld.service")

    with subtest("can_start_slurmd"):
        for node in [node1, node2, node3]:
            node.succeed("systemctl restart slurmd.service")
            node.wait_for_unit("slurmd")

    # Test that the cluster works and can distribute jobs;

    with subtest("run_distributed_command"):
        # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
        # The output must contain the 3 different names
        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

    with subtest("check_slurm_dbd"):
        # find the srun job from above in the database; poll instead of
        # sleeping a fixed time, since the record may take a while to appear
        control.wait_until_succeeds("sacct | grep hostname")
  '';
|
|
|
|
|
})
|