-
-
Notifications
You must be signed in to change notification settings - Fork 231
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Filter out malformed nvidia-smi process_name XML tag #1910
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,23 +8,48 @@ | |
from .utils import CWLObjectType | ||
|
||
|
||
def cuda_device_count() -> str: | ||
"""Determine the number of attached CUDA GPUs.""" | ||
# For the number of GPUs, we can use the following query | ||
cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"] | ||
try: | ||
# This is equivalent to subprocess.check_output, but use | ||
# subprocess.run so we can use separate MagicMocks in test_cuda.py | ||
proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True) # nosec | ||
except Exception as e: | ||
_logger.warning("Error checking number of GPUs with nvidia-smi: %s", e) | ||
return "0" | ||
# NOTE: On a machine with N GPUs the query return N lines, each containing N. | ||
return proc.stdout.decode("utf-8").split("\n")[0] | ||
|
||
|
||
def cuda_version_and_device_count() -> Tuple[str, int]: | ||
"""Determine the CUDA version and number of attached CUDA GPUs.""" | ||
count = int(cuda_device_count()) | ||
|
||
# Since there is no specific query for the cuda version, we have to use | ||
# `nvidia-smi -q -x` | ||
# However, apparently nvidia-smi is not safe to call concurrently. | ||
# With --parallel, sometimes the returned XML will contain | ||
# <process_name>\xff...\xff</process_name> | ||
# (or other arbitrary bytes) and xml.dom.minidom.parseString will raise | ||
# "xml.parsers.expat.ExpatError: not well-formed (invalid token)" | ||
# So we either need to use `grep -v process_name` to blacklist that tag, | ||
# (and hope that no other tags cause problems in the future) | ||
# or better yet use `grep cuda_version` to only grab the tags we will use. | ||
cmd = "nvidia-smi -q -x | grep cuda_version" | ||
try: | ||
out = subprocess.check_output(["nvidia-smi", "-q", "-x"]) # nosec | ||
out = subprocess.check_output(cmd, shell=True) # nosec | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would prefer that, instead of using a subshell, it captures the output and does the substring search in Python. For a simple substring search this really is as simple as something like:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, instead of using minidom to pull a single tag, it would be easier to just use a regular expression to look for |
||
except Exception as e: | ||
_logger.warning("Error checking CUDA version with nvidia-smi: %s", e) | ||
return ("", 0) | ||
dm = xml.dom.minidom.parseString(out) # nosec | ||
|
||
ag = dm.getElementsByTagName("attached_gpus") | ||
if len(ag) < 1 or ag[0].firstChild is None: | ||
_logger.warning( | ||
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s", | ||
out, | ||
) | ||
try: | ||
dm = xml.dom.minidom.parseString(out) # nosec | ||
except xml.parsers.expat.ExpatError as e: | ||
_logger.warning("Error parsing XML stdout of nvidia-smi: %s", e) | ||
_logger.warning("stdout: %s", out) | ||
return ("", 0) | ||
ag_element = ag[0].firstChild | ||
|
||
cv = dm.getElementsByTagName("cuda_version") | ||
if len(cv) < 1 or cv[0].firstChild is None: | ||
|
@@ -35,13 +60,10 @@ def cuda_version_and_device_count() -> Tuple[str, int]: | |
return ("", 0) | ||
cv_element = cv[0].firstChild | ||
|
||
if isinstance(cv_element, xml.dom.minidom.Text) and isinstance( | ||
ag_element, xml.dom.minidom.Text | ||
): | ||
return (cv_element.data, int(ag_element.data)) | ||
if isinstance(cv_element, xml.dom.minidom.Text): | ||
return (cv_element.data, count) | ||
_logger.warning( | ||
"Error checking CUDA version with nvidia-smi. " | ||
"Either 'attached_gpus' or 'cuda_version' was not a text node: %s", | ||
"Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s", | ||
out, | ||
) | ||
return ("", 0) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
and update the `return` as well