diff --git a/group_vars/htcondor-manager.yml b/group_vars/htcondor-manager.yml
new file mode 100644
index 000000000..5ae1c01d2
--- /dev/null
+++ b/group_vars/htcondor-manager.yml
@@ -0,0 +1,3 @@
+# Configure the HTCondor central manager node.
+---
+htcondor_role_manager: true
diff --git a/group_vars/central-manager-secondary-host.yml b/group_vars/htcondor-secondary-submit-host.yml
similarity index 100%
rename from group_vars/central-manager-secondary-host.yml
rename to group_vars/htcondor-secondary-submit-host.yml
diff --git a/group_vars/htcondor-secondary/vars.yml b/group_vars/htcondor-secondary/vars.yml
new file mode 100644
index 000000000..76b5c7a08
--- /dev/null
+++ b/group_vars/htcondor-secondary/vars.yml
@@ -0,0 +1,8 @@
+# Configure nodes in the secondary HTCondor cluster.
+#
+# Nodes in the secondary HTCondor cluster belong both to the
+# "htcondor-secondary" (with group priority > 1) and "htcondor" groups. They
+# thus inherit variables from the latter.
+---
+htcondor_server: "build.galaxyproject.eu"
+htcondor_port: 9628
diff --git a/group_vars/htcondor-secondary/vault.yml b/group_vars/htcondor-secondary/vault.yml
new file mode 100644
index 000000000..728b73381
--- /dev/null
+++ b/group_vars/htcondor-secondary/vault.yml
@@ -0,0 +1,10 @@
+$ANSIBLE_VAULT;1.1;AES256
+31353533313831356632376636636564653732313930623263376437313362386632623732306136
+3465326632326138646330353164336363653764396237370a393562613834343765313835656362
+66633030353534663831323939386335316130343137396139633038366438613731376130663564
+6635643366613463390a663637643834366632643730666131323737633966393335343734663731
+63346138623034333265633465376633313537313062633633353261623934333037646532303132
+63643364633136613265333461623036313964383932336335623236623462316437303964346163
+32386236303765353936333563303934323964383039626233613333396431383936326530343931
+33636531343831663864373365613036333964343534616664356462383066623238326138373435
+3566
diff --git a/group_vars/htcondor-submit.yml b/group_vars/htcondor-submit.yml
new file mode 100644
index 000000000..9309d2c23
--- /dev/null
+++ b/group_vars/htcondor-submit.yml
@@ -0,0 +1,3 @@
+# Configure HTCondor submit nodes.
+---
+htcondor_role_submit: true
diff --git a/group_vars/htcondor/vars.yml b/group_vars/htcondor/vars.yml
new file mode 100644
index 000000000..f486f03bc
--- /dev/null
+++ b/group_vars/htcondor/vars.yml
@@ -0,0 +1,55 @@
+# Configure nodes in the HTCondor cluster.
+---
+htcondor_server: "condor-cm.galaxyproject.eu"
+htcondor_domain: bi.uni-freiburg.de
+htcondor_port: 9618
+# Version and channel are quoted so that YAML keeps them as strings; a bare
+# number is parsed as a float and would turn e.g. 23.10 into 23.1.
+htcondor_version: "23.0"
+htcondor_channel: "23.0"
+htcondor_firewall_condor: "{{ htcondor_port == 9618 }}"
+htcondor_firewall_nfs: false
+htcondor_role_execute: false
+htcondor_role_manager: false
+htcondor_role_submit: false
+htcondor_password: "{{ vault_htcondor_password }}"
+
+# Settings specific to the `condor_config.local.j2` configuration file.
+htcondor_allow_write: "10.5.68.0/24, 132.230.223.0/24,132.230.153.0/28"
+htcondor_allow_negotiator: "132.230.223.239,$(CONDOR_HOST),$(ALLOW_WRITE)"
+htcondor_allow_administrator: "$(ALLOW_NEGOTIATOR)"
+htcondor_system_periodic_hold: "{{ 30 * 24 * 60 * 60 }}"
+htcondor_system_periodic_remove: "{{ 2 * 24 * 60 * 60 }}"
+# htcondor_network_interface -> Defined per-host in host_vars.
+htcondor_master_update_interval: 150
+htcondor_classad_lifetime: 300
+htcondor_negotiator_interval: 15
+htcondor_negotiator_update_interval: 100
+htcondor_schedd_interval: 60
+htcondor_job_start_count: 250
+htcondor_job_start_delay: 0
+htcondor_claim_worklife: 120
+htcondor_negotiator_post_job_rank: "isUndefined(RemoteOwner) * (10000 - TotalLoadAvg)"
+
+# Settings specific to the `usegalaxy_eu.htcondor` role (to be replaced with
+# `grycap.htcondor`).
+condor_host: "{{ htcondor_server }}"
+condor_fs_domain: "{{ htcondor_domain }}"
+condor_uid_domain: "{{ htcondor_domain }}"
+condor_allow_write: "{{ htcondor_allow_write }}"
+# condor_daemons -> Defined per-host in host_vars.
+condor_allow_negotiator: "{{ htcondor_allow_negotiator }}"
+condor_allow_administrator: "{{ htcondor_allow_administrator }}"
+condor_system_periodic_hold: "{{ htcondor_system_periodic_hold }}"
+condor_system_periodic_remove: "{{ htcondor_system_periodic_remove }}"
+condor_network_interface: "{{ htcondor_network_interface }}"
+condor_extra: |
+  MASTER_UPDATE_INTERVAL = {{ htcondor_master_update_interval }}
+  CLASSAD_LIFETIME = {{ htcondor_classad_lifetime }}
+  NEGOTIATOR_INTERVAL = {{ htcondor_negotiator_interval }}
+  NEGOTIATOR_UPDATE_INTERVAL = {{ htcondor_negotiator_update_interval }}
+  SCHEDD_INTERVAL = {{ htcondor_schedd_interval }}
+  JOB_START_COUNT = {{ htcondor_job_start_count }}
+  JOB_START_DELAY = {{ htcondor_job_start_delay }}
+  CLAIM_WORKLIFE = {{ htcondor_claim_worklife }}
+  NEGOTIATOR_POST_JOB_RANK = {{ htcondor_negotiator_post_job_rank }}
diff --git a/group_vars/htcondor/vault.yml b/group_vars/htcondor/vault.yml
new file mode 100644
index 000000000..af51099b9
--- /dev/null
+++ b/group_vars/htcondor/vault.yml
@@ -0,0 +1,10 @@
+$ANSIBLE_VAULT;1.1;AES256
+36336166336332656436376537343036353234366164616236393139313932343538313133373639
+3064333637333539353566396361666362666539353231360a646430356366343632633637326462
+39333232646363656438316533666664613935353336313064323038313564383734373433656330
+3161396636623764660a636332303565396630666134626235636363636434623537333933653537
+37383165643433633630353961623930653139653132303235306539613332346662323764356563
+65303062333738616266383339366165643264633038323533306365623034656563333731393465
+66386263353433303832363936323138386637636366663338336263323835663730616639393831
+32333161633131323534306565626530616364386261646439336436303834386265396161333133
+3130
diff --git a/group_vars/sn06.yml b/group_vars/sn06.yml
index e425c40b9..dc40cc245 100644
--- a/group_vars/sn06.yml
+++ b/group_vars/sn06.yml
@@ -222,33 +222,6 @@ galaxy_systemd_memory_limit: 120
 galaxy_systemd_memory_limit_handler: 30
 galaxy_systemd_memory_limit_workflow: 15
 
-# HTCondor
-condor_host: "condor-cm.galaxyproject.eu"
-condor_fs_domain: bi.uni-freiburg.de
-condor_uid_domain: bi.uni-freiburg.de
-condor_allow_write: "10.5.68.0/24, 132.230.223.0/24,132.230.153.0/28"
-condor_daemons:
-  - COLLECTOR
-  - NEGOTIATOR
-  - MASTER
-  - SCHEDD
-condor_allow_negotiator: "132.230.223.239,$(CONDOR_HOST),$(ALLOW_WRITE)"
-condor_allow_administrator: "$(ALLOW_NEGOTIATOR)"
-
-condor_system_periodic_hold: "{{ 30 * 24 * 60 * 60 }}"
-condor_system_periodic_remove: "{{ 2 * 24 * 60 * 60 }}"
-condor_network_interface: ens802f0.223
-condor_extra: |
-  MASTER_UPDATE_INTERVAL = 150
-  CLASSAD_LIFETIME = 300
-  NEGOTIATOR_INTERVAL = 15
-  NEGOTIATOR_UPDATE_INTERVAL = 100
-  SCHEDD_INTERVAL = 60
-  JOB_START_COUNT = 250
-  JOB_START_DELAY = 0
-  CLAIM_WORKLIFE = 120
-  NEGOTIATOR_POST_JOB_RANK = isUndefined(RemoteOwner) * (10000 - TotalLoadAvg)
-
 # gie_proxy
 gie_proxy_dir: "{{ galaxy_root }}/gie-proxy/proxy"
 gie_proxy_git_version: main
diff --git a/host_vars/build.galaxyproject.eu.yml b/host_vars/build.galaxyproject.eu.yml
new file mode 100644
index 000000000..d32f2b473
--- /dev/null
+++ b/host_vars/build.galaxyproject.eu.yml
@@ -0,0 +1,2 @@
+---
+htcondor_network_interface: ens802f0.223
diff --git a/host_vars/nspawn-htcondor.sn06.galaxyproject.eu.yml b/host_vars/nspawn-htcondor.sn06.galaxyproject.eu.yml
new file mode 100644
index 000000000..d32f2b473
--- /dev/null
+++ b/host_vars/nspawn-htcondor.sn06.galaxyproject.eu.yml
@@ -0,0 +1,2 @@
+---
+htcondor_network_interface: ens802f0.223
diff --git a/host_vars/sn06.galaxyproject.eu.yml b/host_vars/sn06.galaxyproject.eu.yml
new file mode 100644
index 000000000..ef317b889
--- /dev/null
+++ b/host_vars/sn06.galaxyproject.eu.yml
@@ -0,0 +1,9 @@
+---
+htcondor_network_interface: ens802f0.223
+
+# Settings specific to the `usegalaxy_eu.htcondor` role.
+condor_daemons:
+  - COLLECTOR
+  - NEGOTIATOR
+  - MASTER
+  - SCHEDD
diff --git a/hosts b/hosts
index 8b44d81ee..f4f308a6c 100644
--- a/hosts
+++ b/hosts
@@ -76,14 +76,50 @@ maintenance.galaxyproject.eu
 [all:vars]
 ansible_ssh_user=centos
 
-[central-manager]
-manager.vgcn.galaxyproject.eu ansible_ssh_user=root
+[htcondor:children]
+htcondor-manager
+htcondor-submit
+htcondor-secondary
 
-[central-manager-secondary]
-manager-secondary.galaxyproject.eu ansible_host=127.0.0.1 ansible_port=2222 ansible_ssh_user=root ansible_ssh_common_args='-o HostKeyAlias=manager-secondary.galaxyproject.eu -o ProxyCommand="ssh -W %h:%p -q root@sn06.galaxyproject.eu"'
+[htcondor-manager]
+sn06.galaxyproject.eu
+
+[htcondor-manager:children]
+htcondor-secondary-manager
+
+[htcondor-manager:vars]
+ansible_group_priority=2
+
+[htcondor-submit]
+sn06.galaxyproject.eu
+
+[htcondor-submit:children]
+htcondor-secondary-submit
+
+[htcondor-submit:vars]
+ansible_group_priority=2
 
-[central-manager-secondary-host]
-sn06.galaxyproject.eu ansible_ssh_user=root
+[htcondor-secondary:children]
+htcondor-secondary-manager
+htcondor-secondary-submit
+
+[htcondor-secondary:vars]
+ansible_group_priority=3
+
+[htcondor-secondary-manager]
+build.galaxyproject.eu ansible_ssh_user=root
+
+[htcondor-secondary-manager:vars]
+ansible_group_priority=4
+
+[htcondor-secondary-submit]
+nspawn-htcondor.sn06.galaxyproject.eu ansible_host=127.0.0.1 ansible_port=2222 ansible_ssh_user=root ansible_ssh_common_args='-o HostKeyAlias=nspawn-htcondor.sn06.galaxyproject.eu -o ProxyCommand="ssh -W %h:%p -q centos@sn06.galaxyproject.eu"'
+
+[htcondor-secondary-submit:vars]
+ansible_group_priority=4
+
+[htcondor-secondary-submit-host]
+sn06.galaxyproject.eu
 
-[central-manager-secondary-host:vars]
+[htcondor-secondary-submit-host:vars]
 ansible_group_priority=2
diff --git a/htcondor-secondary.yml b/htcondor.yml
similarity index 87%
rename from htcondor-secondary.yml
rename to htcondor.yml
index 455efeb54..597414625 100644
--- a/htcondor-secondary.yml
+++ b/htcondor.yml
@@ -1,6 +1,6 @@
 ---
 - name: Systemd-nspawn container aimed at running a second HTCondor installation.
-  hosts: central-manager-secondary-host
+  hosts: htcondor-secondary-submit-host
   handlers:
     - name: Reload sshd  # (in the container)
       when: nspawn_ssh | default(no)
@@ -11,7 +11,7 @@
       changed_when: true
   vars_files:
     - mounts/mountpoints.yml
-    - secret_group_vars/central-manager-secondary-host.yml
+    - secret_group_vars/htcondor-secondary-submit-host.yml
   pre_tasks:
     # Because it is already disabled for sn06 and this setup is needed just
     # temporarily.
@@ -177,5 +177,40 @@
         key: "[127.0.0.1]:{{ nspawn_ssh_config.Port }} {{ nspawn_ssh_host_key.content | b64decode }}"
       when: nspawn_ssh_host_trust_container
 
-- name: Secondary HTCondor 10 cluster.
-  hosts: central-manager-secondary
+- name: HTCondor cluster.
+  hosts: htcondor:!sn06.galaxyproject.eu
+  handlers:
+    - name: Reload HTCondor
+      # Only reload when the `condor.service` unit exists and is running.
+      when:
+        - "'condor.service' in service_facts.ansible_facts.services"
+        - "service_facts.ansible_facts.services['condor.service'].state == 'running'"
+      become: true
+      ansible.builtin.service:
+        name: condor
+        state: reloaded
+  pre_tasks:
+    - name: Ensure the HTCondor configuration directory exists.
+      become: true
+      ansible.builtin.file:
+        path: /etc/condor
+        state: directory
+        owner: root
+        group: root
+        mode: "0755"
+
+    - name: Template HTCondor configuration.
+      become: true
+      ansible.builtin.template:
+        src: htcondor/condor_config.local.j2
+        dest: /etc/condor/condor_config.local
+        owner: root
+        group: root
+        mode: "0644"
+      notify: Reload HTCondor
+
+    - name: Check if HTCondor is running.
+      ansible.builtin.service_facts:
+      register: service_facts
+  roles:
+    - grycap.htcondor
diff --git a/requirements.yaml b/requirements.yaml
index 02fad7060..3e2162dce 100644
--- a/requirements.yaml
+++ b/requirements.yaml
@@ -88,6 +88,9 @@ roles:
     version: 0.0.1
   - name: usegalaxy_eu.htcondor
     version: 1.0.1
+  - name: grycap.htcondor
+    src: https://github.com/kysrpex/grycap-ansible-role-htcondor
+    version: d9a4aab0052dfb31d48c986d39a7f5e3692abba4
   - name: usegalaxy-eu.update-hosts
     src: https://github.com/usegalaxy-eu/ansible-update-hosts
     version: 0.2.0
diff --git a/secret_group_vars/central-manager-secondary-host.yml b/secret_group_vars/htcondor-secondary-submit-host.yml
similarity index 100%
rename from secret_group_vars/central-manager-secondary-host.yml
rename to secret_group_vars/htcondor-secondary-submit-host.yml
diff --git a/templates/htcondor/condor_config.local.j2 b/templates/htcondor/condor_config.local.j2
new file mode 100644
index 000000000..50d37cbfb
--- /dev/null
+++ b/templates/htcondor/condor_config.local.j2
@@ -0,0 +1,54 @@
+# Networking
+CONDOR_HOST = {{ htcondor_server }}
+COLLECTOR_HOST = $(CONDOR_HOST):{{ htcondor_port }}
+SHARED_PORT_PORT = {{ htcondor_port }}
+{% if "htcondor-secondary" in group_names %}
+WANT_UDP_COMMAND_SOCKET = False
+UPDATE_COLLECTOR_WITH_TCP = True
+UPDATE_VIEW_COLLECTOR_WITH_TCP = True
+{% endif %}
+{% if htcondor_network_interface is defined %}
+NETWORK_INTERFACE = {{ htcondor_network_interface }}
+{% endif %}
+
+# Security
+ALLOW_WRITE = {{ htcondor_allow_write }}
+ALLOW_READ = $(ALLOW_WRITE)
+ALLOW_NEGOTIATOR = {{ htcondor_allow_negotiator }}
+{% if htcondor_allow_administrator is defined %}
+ALLOW_ADMINISTRATOR = {{ htcondor_allow_administrator }}
+{% endif %}
+ALLOW_OWNER = $(ALLOW_ADMINISTRATOR)
+ALLOW_CLIENT = *
+# Fall back to the cluster-wide `htcondor_domain` when no dedicated filesystem
+# or UID domain is configured.
+FILESYSTEM_DOMAIN = {{ htcondor_fs_domain | default(htcondor_domain) }}
+UID_DOMAIN = {{ htcondor_uid_domain | default(htcondor_domain) }}
+TRUST_UID_DOMAIN = True
+SOFT_UID_DOMAIN = True
+SEC_CLIENT_AUTHENTICATION_METHODS = IDTOKENS, FS
+SEC_READ_AUTHENTICATION_METHODS = IDTOKENS, FS
+
+# Job management
+{% if htcondor_system_periodic_hold is defined %}
+SYSTEM_PERIODIC_HOLD = \
+  (JobStatus == 1 || JobStatus == 2) && \
+  ((time() - JobStartDate) >= ({{ htcondor_system_periodic_hold }}))
+SYSTEM_PERIODIC_HOLD_REASON = \
+  ifThenElse(((time() - JobStartDate) >= ({{ htcondor_system_periodic_hold }})), \
+             "Maximum wallclock time exceeded", \
+             "Unspecified reason")
+SYSTEM_PERIODIC_REMOVE = \
+  (JobStatus == 5 && time() - EnteredCurrentStatus > {{ htcondor_system_periodic_remove }})
+{% endif %}
+
+# Scheduling
+MASTER_UPDATE_INTERVAL = {{ htcondor_master_update_interval }}
+CLASSAD_LIFETIME = {{ htcondor_classad_lifetime }}
+NEGOTIATOR_INTERVAL = {{ htcondor_negotiator_interval }}
+NEGOTIATOR_UPDATE_INTERVAL = {{ htcondor_negotiator_update_interval }}
+SCHEDD_INTERVAL = {{ htcondor_schedd_interval }}
+JOB_START_COUNT = {{ htcondor_job_start_count }}
+JOB_START_DELAY = {{ htcondor_job_start_delay }}
+CLAIM_WORKLIFE = {{ htcondor_claim_worklife }}
+NEGOTIATOR_POST_JOB_RANK = {{ htcondor_negotiator_post_job_rank }}