Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter out malformed nvidia-smi process_name XML tag #1910

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 37 additions & 15 deletions cwltool/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,48 @@
from .utils import CWLObjectType


def cuda_device_count() -> str:
Copy link
Member

@mr-c mr-c Oct 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def cuda_device_count() -> str:
def cuda_device_count() -> int:

and update the return as well

"""Determine the number of attached CUDA GPUs."""
# For the number of GPUs, we can use the following query
cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"]
try:
# This is equivalent to subprocess.check_output, but use
# subprocess.run so we can use separate MagicMocks in test_cuda.py
proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True) # nosec
except Exception as e:
_logger.warning("Error checking number of GPUs with nvidia-smi: %s", e)
return "0"
# NOTE: On a machine with N GPUs the query return N lines, each containing N.
return proc.stdout.decode("utf-8").split("\n")[0]


def cuda_version_and_device_count() -> Tuple[str, int]:
"""Determine the CUDA version and number of attached CUDA GPUs."""
count = int(cuda_device_count())

# Since there is no specific query for the cuda version, we have to use
# `nvidia-smi -q -x`
# However, apparently nvidia-smi is not safe to call concurrently.
# With --parallel, sometimes the returned XML will contain
# <process_name>\xff...\xff</process_name>
# (or other arbitrary bytes) and xml.dom.minidom.parseString will raise
# "xml.parsers.expat.ExpatError: not well-formed (invalid token)"
# So we either need to use `grep -v process_name` to blacklist that tag,
# (and hope that no other tags cause problems in the future)
# or better yet use `grep cuda_version` to only grab the tags we will use.
cmd = "nvidia-smi -q -x | grep cuda_version"
try:
out = subprocess.check_output(["nvidia-smi", "-q", "-x"]) # nosec
out = subprocess.check_output(cmd, shell=True) # nosec
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer that instead of using a subshell it captures the output and does the substring search in Python. For a simple substring search this really is as simple as something like:

out = subprocess.check_output(["nvidia-smi", "-q", "-x"])
if "cuda_version" not in out:
   raise ...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually instead of using minidom, to pull a single tag it would be easier to just use a regular expression to look for <cuda_version>.

except Exception as e:
_logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
return ("", 0)
dm = xml.dom.minidom.parseString(out) # nosec

ag = dm.getElementsByTagName("attached_gpus")
if len(ag) < 1 or ag[0].firstChild is None:
_logger.warning(
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s",
out,
)
try:
dm = xml.dom.minidom.parseString(out) # nosec
except xml.parsers.expat.ExpatError as e:
_logger.warning("Error parsing XML stdout of nvidia-smi: %s", e)
_logger.warning("stdout: %s", out)
return ("", 0)
ag_element = ag[0].firstChild

cv = dm.getElementsByTagName("cuda_version")
if len(cv) < 1 or cv[0].firstChild is None:
Expand All @@ -35,13 +60,10 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
return ("", 0)
cv_element = cv[0].firstChild

if isinstance(cv_element, xml.dom.minidom.Text) and isinstance(
ag_element, xml.dom.minidom.Text
):
return (cv_element.data, int(ag_element.data))
if isinstance(cv_element, xml.dom.minidom.Text):
return (cv_element.data, count)
_logger.warning(
"Error checking CUDA version with nvidia-smi. "
"Either 'attached_gpus' or 'cuda_version' was not a text node: %s",
"Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s",
out,
)
return ("", 0)
Expand Down
98 changes: 28 additions & 70 deletions tests/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,11 @@ def _makebuilder(cudaReq: CWLObjectType) -> Builder:


@mock.patch("subprocess.check_output")
@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
def test_cuda_job_setup_check(
makedirs: MagicMock, count: MagicMock, check_output: MagicMock
) -> None:
runtime_context = RuntimeContext({})

cudaReq: CWLObjectType = {
Expand All @@ -101,74 +104,43 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N
}
builder = _makebuilder(cudaReq)

count.return_value = "1"
check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""

jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
jb._setup(runtime_context)


@mock.patch("subprocess.check_output")
@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
runtime_context = RuntimeContext({})

cudaReq: CWLObjectType = {
"class": "http://commonwl.org/cwltool#CUDARequirement",
"cudaVersionMin": "2.0",
"cudaComputeCapability": "1.0",
}
builder = _makebuilder(cudaReq)

check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_empty_attached_gpus(
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
def test_cuda_job_setup_check_err(
makedirs: MagicMock, count: MagicMock, check_output: MagicMock
) -> None:
runtime_context = RuntimeContext({})

cudaReq: CWLObjectType = {
"class": "http://commonwl.org/cwltool#CUDARequirement",
"cudaVersionMin": "1.0",
"cudaVersionMin": "2.0",
"cudaComputeCapability": "1.0",
}
builder = _makebuilder(cudaReq)

count.return_value = "1"
check_output.return_value = """
<nvidia>
<attached_gpus></attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""

jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
assert (
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
in caplog.text
)


@mock.patch("subprocess.check_output")
@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
def test_cuda_job_setup_check_err_missing_query_gpu_count(
makedirs: MagicMock, count: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})

Expand All @@ -179,25 +151,19 @@ def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
}
builder = _makebuilder(cudaReq)

check_output.return_value = """
<nvidia>
<cuda_version>1.0</cuda_version>
</nvidia>
"""
count.return_value = ""

jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
assert (
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
in caplog.text
)
assert "invalid literal for int() with base 10" in caplog.text


@mock.patch("subprocess.check_output")
@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_empty_cuda_version(
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})

Expand All @@ -208,11 +174,9 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
}
builder = _makebuilder(cudaReq)

count.return_value = "1"
check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version></cuda_version>
</nvidia>
"""

jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
Expand All @@ -225,9 +189,10 @@ def test_cuda_job_setup_check_err_empty_cuda_version(


@mock.patch("subprocess.check_output")
@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_missing_cuda_version(
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})

Expand All @@ -238,25 +203,20 @@ def test_cuda_job_setup_check_err_missing_cuda_version(
}
builder = _makebuilder(cudaReq)

check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
</nvidia>
"""
count.return_value = "1"
check_output.return_value = ""

jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
assert (
"Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
in caplog.text
)
assert "Error parsing XML stdout of nvidia-smi" in caplog.text


@mock.patch("subprocess.check_output")
@mock.patch("cwltool.cuda.cuda_device_count")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err_wrong_type_cuda_version(
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
) -> None:
runtime_context = RuntimeContext({})

Expand All @@ -267,19 +227,17 @@ def test_cuda_job_setup_check_err_wrong_type_cuda_version(
}
builder = _makebuilder(cudaReq)

count.return_value = "1"
check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version><subelement /></cuda_version>
</nvidia>
"""

jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)
assert (
"Error checking CUDA version with nvidia-smi. "
"Either 'attached_gpus' or 'cuda_version' was not a text node" in caplog.text
"Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node"
in caplog.text
)


Expand Down
2 changes: 1 addition & 1 deletion tests/wf/nvidia-smi-container.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ requirements:
cudaVersionMin: "1.0"
cudaComputeCapability: "1.0"
DockerRequirement:
dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
dockerPull: "nvidia/cuda:11.4.3-runtime-ubuntu20.04"
inputs: []
outputs: []
# Assume this will exit non-zero (resulting in a failing test case) if
Expand Down