Skip to content

Commit

Permalink
Fix run_multi_gpu script multi-gpu issue and refactor code
Browse files Browse the repository at this point in the history
  • Loading branch information
Ruturaj4 committed Jan 15, 2025
1 parent 99d675a commit 5727ba6
Showing 1 changed file with 73 additions and 31 deletions.
104 changes: 73 additions & 31 deletions build/rocm/run_multi_gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,39 +13,81 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: a shebang is only honored on the very first line of a file. At this
# position it is an ordinary comment that documents the intended interpreter.
#!/usr/bin/env bash

# Strict mode: exit on error (-e), treat unset variables as errors (-u), trace
# every command (-x), and fail a pipeline when any stage fails (pipefail).
set -euxo pipefail

# Directory that receives the per-suite HTML reports and the merged report.
LOG_DIR="./logs"

# --------------------------------------------------------------------------------
# Function to detect number of AMD/ATI GPUs using lspci.
#
# Outputs: the number of lspci lines matching 'controller.*AMD/ATI' on stdout
#          (0 when nothing matches).
# Returns: 0 on success. Exits with status 1, with a diagnostic on stderr,
#          when the lspci command is not available.
# --------------------------------------------------------------------------------
detect_amd_gpus() {
  # Make sure lspci is installed. The message goes to stderr because callers
  # capture this function's stdout; printing it there would hide the error
  # inside the captured "count".
  if ! command -v lspci &>/dev/null; then
    echo "Error: lspci command not found. Aborting." >&2
    exit 1
  fi

  # Count AMD/ATI GPU controllers. grep -c prints 0 but exits with status 1
  # when nothing matches. Zero GPUs is a valid answer here, so the status is
  # deliberately ignored instead of tripping errexit/pipefail.
  local count
  count=$(lspci | grep -c 'controller.*AMD/ATI' || true)
  echo "${count:-0}"
}

# --------------------------------------------------------------------------------
# Function to run tests with specified GPUs.
#
# Globals:   LOG_DIR (read; falls back to ./logs when unset or empty),
#            HIP_VISIBLE_DEVICES (exported)
# Arguments: $1 - comma-separated device indices, e.g. "0,1,2,3"
# Returns:   0 when both pytest runs and the report merge succeed; otherwise
#            the exit status of the last step that failed.
# --------------------------------------------------------------------------------
run_tests() {
  local gpu_devices="$1"
  local log_dir="${LOG_DIR:-./logs}"
  local status=0

  echo "Running tests on GPUs: $gpu_devices"
  export HIP_VISIBLE_DEVICES="$gpu_devices"

  # Create the log directory if it doesn't exist; nothing can be written
  # without it, so give up immediately on failure.
  mkdir -p "$log_dir" || return

  # Each step records its failure instead of aborting, so a failing suite
  # does not prevent the remaining suite from running or the reports from
  # being merged. The failure is still reported through the return status.
  python3 -m pytest \
    --html="${log_dir}/multi_gpu_pmap_test_log.html" \
    --reruns 3 \
    tests/pmap_test.py || status=$?

  python3 -m pytest \
    --html="${log_dir}/multi_gpu_multi_device_test_log.html" \
    --reruns 3 \
    tests/multi_device_test.py || status=$?

  # Merge individual HTML reports into one.
  python3 -m pytest_html_merger \
    -i "$log_dir" \
    -o "${log_dir}/final_compiled_report.html" || status=$?

  return "$status"
}

# --------------------------------------------------------------------------------
# Main entry point.
#
# Verifies that python3 is available, asks detect_amd_gpus for the number of
# AMD/ATI GPUs and calls run_tests with a device list sized to that count:
#   8 or more -> devices 0-7
#   4 to 7    -> devices 0-3
#   2 or 3    -> devices 0-1
#   otherwise -> device 0
# Exits with status 1 when python3 is missing or GPU detection fails.
# --------------------------------------------------------------------------------
main() {
  # Ensure python3 is available; diagnostics belong on stderr.
  if ! command -v python3 &>/dev/null; then
    echo "Error: Python3 is not available. Aborting." >&2
    exit 1
  fi

  # Detect number of AMD/ATI GPUs. The declaration is kept separate from the
  # assignment so a detection failure is not masked by `local`.
  local gpu_count
  gpu_count=$(detect_amd_gpus) || exit 1
  echo "Number of AMD/ATI GPUs detected: $gpu_count"

  # Decide how many GPUs to enable based on count. The comparisons are
  # inclusive (-ge) so that a machine with exactly 8, 4 or 2 GPUs uses all
  # of them rather than dropping to the next smaller tier.
  if [[ "$gpu_count" -ge 8 ]]; then
    run_tests "0,1,2,3,4,5,6,7"
  elif [[ "$gpu_count" -ge 4 ]]; then
    run_tests "0,1,2,3"
  elif [[ "$gpu_count" -ge 2 ]]; then
    run_tests "0,1"
  else
    run_tests "0"
  fi
}

# Run the script, forwarding all command-line arguments to main.
main "$@"

0 comments on commit 5727ba6

Please sign in to comment.