def cuda_version_and_device_count() -> Tuple[str, int]:
    """Determine the CUDA version and number of attached CUDA GPUs.

    Both values come from ``nvidia-smi``: the GPU count from a dedicated
    ``--query-gpu=count`` query, and the CUDA version from the
    ``cuda_version`` element of the ``-q -x`` XML report.

    :returns: ``(cuda_version, gpu_count)``. When ``nvidia-smi`` is missing,
        exits with an error, or prints output that cannot be interpreted, a
        warning is logged and ``("", 0)`` is returned.
    """
    # Imported explicitly rather than relying on xml.dom.minidom having pulled
    # in xml.parsers.expat as a side effect of parsing.
    from xml.parsers.expat import ExpatError

    unavailable: Tuple[str, int] = ("", 0)

    # For the number of GPUs, there is a dedicated query.
    cmd_count = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"]
    try:
        out_count = subprocess.check_output(cmd_count)  # nosec
    except Exception as e:
        _logger.warning("Error checking number of GPUs with nvidia-smi: %s", e)
        return unavailable
    # The query prints one row per GPU, each row repeating the total count,
    # so only the first field is converted (int() accepts ASCII bytes).
    count_fields = out_count.split()
    try:
        count = int(count_fields[0])
    except (IndexError, ValueError):
        _logger.warning(
            "Error checking number of GPUs with nvidia-smi. Unexpected output: %s",
            out_count,
        )
        return unavailable

    # There is no specific query for the CUDA version, so it has to be read
    # from `nvidia-smi -q -x`.
    # However, nvidia-smi is apparently not safe to call concurrently: with
    # --parallel, the returned XML sometimes contains arbitrary bytes (seen
    # in the process_name tag) and xml.dom.minidom.parseString then raises
    # "xml.parsers.expat.ExpatError: not well-formed (invalid token)".
    # Therefore only the line(s) mentioning cuda_version are parsed. The
    # filtering is done here instead of piping through `grep` in a shell, so
    # that a failing nvidia-smi is not masked by grep's exit status.
    cmd_cuda_version = ["nvidia-smi", "-q", "-x"]
    try:
        report = subprocess.check_output(cmd_cuda_version)  # nosec
    except Exception as e:
        _logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
        return unavailable
    out = b"\n".join(
        line for line in report.splitlines() if b"cuda_version" in line
    )
    if not out:
        _logger.warning(
            "Error checking CUDA version with nvidia-smi. "
            "Missing 'cuda_version' in the output."
        )
        return unavailable

    try:
        dm = xml.dom.minidom.parseString(out)  # nosec
    except ExpatError as e:
        _logger.warning("Error parsing XML stdout of nvidia-smi: %s", e)
        _logger.warning("stdout: %s", out)
        return unavailable

    cv = dm.getElementsByTagName("cuda_version")
    if len(cv) < 1 or cv[0].firstChild is None:
        _logger.warning(
            "Error checking CUDA version with nvidia-smi. "
            "Missing 'cuda_version' or it is empty.: %s",
            out,
        )
        return unavailable
    cv_element = cv[0].firstChild

    if isinstance(cv_element, xml.dom.minidom.Text):
        return (cv_element.data, count)
    _logger.warning(
        "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s",
        out,
    )
    return unavailable