From 9ac0b854d3bc2fd31a7a71c968a2e22ba7012aa9 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Mon, 11 Dec 2023 13:55:44 -0700 Subject: [PATCH] add split_trace to allow for easy splitting of large traces for parallel processing by gufi_trace2index --- contrib/CMakeLists.txt | 5 + contrib/split_trace.c | 265 +++++++++++++++++++++++++++ test/regression/CMakeLists.txt | 1 + test/regression/setup.sh.in | 2 + test/regression/split_trace.expected | 23 +++ test/regression/split_trace.sh.in | 142 ++++++++++++++ 6 files changed, 438 insertions(+) create mode 100644 contrib/split_trace.c create mode 100644 test/regression/split_trace.expected create mode 100755 test/regression/split_trace.sh.in diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index b26f32e12..08c9ffc66 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -67,6 +67,11 @@ add_executable(gendir gendir.c) target_link_libraries(gendir ${COMMON_LIBRARIES}) add_dependencies(gendir GUFI) +# utility for spliting large traces into smaller files for parallel processing +add_executable(split_trace split_trace.c) +target_link_libraries(split_trace ${COMMON_LIBRARIES}) +add_dependencies(split_trace GUFI) + # potentially useful C++ executables if (CMAKE_CXX_COMPILER) # a more complex index generator diff --git a/contrib/split_trace.c b/contrib/split_trace.c new file mode 100644 index 000000000..f24aff55c --- /dev/null +++ b/contrib/split_trace.c @@ -0,0 +1,265 @@ +/* +This file is part of GUFI, which is part of MarFS, which is released +under the BSD license. + + +Copyright (c) 2017, Los Alamos National Security (LANS), LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +From Los Alamos National Security, LLC: +LA-CC-15-039 + +Copyright (c) 2017, Los Alamos National Security, LLC All rights reserved. +Copyright 2017. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is +modified to produce derivative works, such modified software should be +clearly marked, so as not to confuse it with the version available from +LANL. + +THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +OF SUCH DAMAGE. +*/ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "QueuePerThreadPool.h" +#include "utils.h" + +const size_t GETLINE_DEFAULT_SIZE = 256; /* magic number */ + +struct Range { + int fd; + off_t start, end; /* [start, end) */ + size_t id; +}; + +static int copy_range(QPTPool_t *ctx, const size_t id, void *data, void *args) { + (void) ctx; (void) id; + + struct Range *range = (struct Range *) data; + const char *prefix = (char *) args; + + char outname[MAXPATH]; + SNPRINTF(outname, sizeof(outname), "%s.%zu", prefix, range->id); + + int dst = open(outname, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (dst < 0) { + const int err = errno; + fprintf(stderr, "Error: Unable to open file %s to write range [%jd, %jd): %s (%d)\n", + outname, (intmax_t) range->start, (intmax_t) range->end, strerror(err), errno); + free(range); + return 1; + } + + const off_t size = range->end - range->start; + + /* ignore errors */ + ftruncate(dst, size); + + const int rc = (copyfd(range->fd, range->start, dst, 0, size) > -1); + + close(dst); + free(range); + + return rc; +} + +/* + * Find end of current chunk/stanza + * Return: + * -1 - error + * 0 - found end + * 1 - end of file + */ +int find_end(const int fd, const char delim, off_t *end) { + /* Not checking arguments */ + + char *line = NULL; + size_t size = 0; + ssize_t len = 0; + + /* + * drop first line - assume started in middle of line, and + * can't be sure if first delimiter found is actually + * separator for first and second columns + */ + if ((len = getline_fd(&line, &size, fd, end, GETLINE_DEFAULT_SIZE)) < 1) { + free(line); + return len; + } + + while (1) { + const off_t before_read = *end; + + free(line); + line = NULL; + size = 0; + + len = getline_fd(&line, &size, fd, end, GETLINE_DEFAULT_SIZE); + if (len == 0) { + free(line); + return 1; + } + else if (len < 0) { + return -1; + } + + const char *first_delim = memchr(line, delim, len); + + /* no delimiter */ + if (!first_delim) { + fprintf(stderr, "Error: Line at %jd does not have a delimiter\n", (intmax_t) before_read); + free(line); + return -1; + } + + if (first_delim[1] == 'd') { + *end = before_read; + free(line); + return 0; + } + } + + return -1; +} + +int main(int argc, char *argv[]) { + if (argc < 5) { + fprintf(stderr, "Syntax: %s src_trace delim max_split_count dst_trace_prefix [threads]\n", argv[0]); + return 1; + } + + const char *src = argv[1]; + const char delim = argv[2][0]; + size_t count = 0; + char *prefix = argv[4]; + size_t threads = 1; + + if (sscanf(argv[3], "%zu", &count) != 1) { + fprintf(stderr, "Error: Invalid output count: %s\n", argv[2]); + return 1; + } + + if (argc > 5) { + if (sscanf(argv[5], "%zu", &threads) != 1) { + fprintf(stderr, "Error: Invalid thread count: %s\n", argv[5]); + return 1; + } + } + + const int fd = open(src, O_RDONLY); + if (fd < 0) { + const int err = errno; + fprintf(stderr, "Error: Could not open source trace file %s: %s (%d)\n", + src, strerror(err), err); + return 1; + } + + struct stat st; + if (fstat(fd, &st) != 0) { + const int err = errno; + fprintf(stderr, "Could not fstat trace file: %s (%d)\n", + strerror(err), err); + close(fd); + return 1; + } + + QPTPool_t *pool = QPTPool_init(threads, prefix); + if (QPTPool_start(pool) != 0) { + fprintf(stderr, "Error: Failed to start thread pool\n"); + QPTPool_destroy(pool); + close(fd); + return 1; + } + + /* approximate number of bytes per chunk */ + const size_t jump = (st.st_size / count) + !!(st.st_size % count); + + size_t id = 0; + off_t start = 0; + off_t end = start + jump; + + int rc = 0; + while ((start < st.st_size) && + ((rc = find_end(fd, delim, &end)) == 0)) { + struct Range *range = malloc(sizeof(*range)); + range->fd = fd; + range->start = start; + range->end = end; + range->id = id++; + + QPTPool_enqueue(pool, 0, copy_range, range); + + /* jump to next chunk */ + start = end; + end += jump; + } + + /* need this because last chunk will error out of loop */ + if ((rc > -1) && (start < st.st_size)) { + struct Range *range = malloc(sizeof(*range)); + range->fd = fd; + range->start = start; + range->end = st.st_size; + range->id = id++; + + QPTPool_enqueue(pool, 0, copy_range, range); + } + + QPTPool_wait(pool); + QPTPool_destroy(pool); + + close(fd); + + /* error or no trace files created */ + return (rc < 0) || (id == 0); +} diff --git a/test/regression/CMakeLists.txt b/test/regression/CMakeLists.txt index 2c2031af5..bcae8aedf 100644 --- a/test/regression/CMakeLists.txt +++ b/test/regression/CMakeLists.txt @@ -107,6 +107,7 @@ set(EXAMPLES set(OTHERS bash_completion + split_trace ) if (ZLIB_FOUND) diff --git a/test/regression/setup.sh.in b/test/regression/setup.sh.in index f3fa683e3..eb300c911 100755 --- a/test/regression/setup.sh.in +++ b/test/regression/setup.sh.in @@ -167,6 +167,7 @@ GUFI_TREESUMMARY_ALL="@CMAKE_BINARY_DIR@/src/gufi_treesummary_all" GUFI_UNROLLUP="@CMAKE_BINARY_DIR@/src/gufi_unrollup" OLDBIGFILES="@CMAKE_BINARY_DIR@/examples/oldbigfiles" QUERYDBS="@CMAKE_BINARY_DIR@/src/querydbs" +SPLIT_TRACE="@CMAKE_BINARY_DIR@/contrib/split_trace" USERFILESPACEHOG="@CMAKE_BINARY_DIR@/examples/userfilespacehog" USERFILESPACEHOGUSESUMMARY="@CMAKE_BINARY_DIR@/examples/userfilespacehogusesummary" VERIFYTRACE="@CMAKE_BINARY_DIR@/contrib/verifytrace" @@ -203,6 +204,7 @@ replace() { s/${GUFI_UNROLLUP//\//\\/}/gufi_unrollup/g; s/${OLDBIGFILES//\//\\/}/oldbigfiles/g; s/${QUERYDBS//\//\\/}/querydbs/g; + s/${SPLIT_TRACE//\//\\/}/split_trace/g; s/${USERFILESPACEHOG//\//\\/}/userfilespacehog/g; s/${USERFILESPACEHOGUSESUMMARY//\//\\/}/userfilespacehogusesummary/g; s/${VERIFYTRACE//\//\\/}/verifytrace/g; diff --git a/test/regression/split_trace.expected b/test/regression/split_trace.expected new file mode 100644 index 000000000..6db929a47 --- /dev/null +++ b/test/regression/split_trace.expected @@ -0,0 +1,23 @@ +# split_trace help +$ split_trace +Syntax: split_trace src_trace delim max_split_count dst_trace_prefix [threads] + +Split empty trace file +$ "split_trace" "trace.0" "|" "4" "split" || true + +Expecting 0 trace files. Found 0. + +$ gufi_dir2trace -d "|" -n 1 -x "prefix" "trace" +"trace.0" Already exists! +Creating GUFI Traces trace with 1 threads +Total Dirs: 5 +Total Files: 14 + +$ split_trace "trace.0" "|" 4 "split" + +$ diff "trace.0" "split" + +# Split non-existant trace +$ split_trace "badtrace" "|" 4 "split" || true +Error: Could not open source trace file badtrace: No such file or directory (2) + diff --git a/test/regression/split_trace.sh.in b/test/regression/split_trace.sh.in new file mode 100755 index 000000000..bc2919656 --- /dev/null +++ b/test/regression/split_trace.sh.in @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +# This file is part of GUFI, which is part of MarFS, which is released +# under the BSD license. +# +# +# Copyright (c) 2017, Los Alamos National Security (LANS), LLC +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# +# From Los Alamos National Security, LLC: +# LA-CC-15-039 +# +# Copyright (c) 2017, Los Alamos National Security, LLC All rights reserved. +# Copyright 2017. Los Alamos National Security, LLC. This software was produced +# under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +# Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +# the U.S. Department of Energy. The U.S. Government has rights to use, +# reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +# ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +# ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is +# modified to produce derivative works, such modified software should be +# clearly marked, so as not to confuse it with the version available from +# LANL. +# +# THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL SECURITY, LLC OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + + + +set -e +source @CMAKE_CURRENT_BINARY_DIR@/setup.sh 0 + +OUTPUT="split_trace.out" + +# shellcheck disable=SC2153 +TRACE0="${TRACE}.0" +DELIM="|" + +SPLIT_PREFIX="split" +SPLIT_COUNT=4 +SPLITS=() +for ((i = 0; i < SPLIT_COUNT; i++)) +do + SPLITS+=("${SPLIT_PREFIX}.${i}") +done + + +cleanup() { + rm -rf "${TRACE0}" "${SPLIT_PREFIX}" "${SPLITS[@]}" +} + +cleanup_exit() { + cleanup + setup_cleanup +} + +trap cleanup_exit EXIT + +cleanup + +( +echo "# ${SPLIT_TRACE} help" | replace +run_no_sort "${SPLIT_TRACE}" + +# ################################# +# test splitting empty trace file +rm -f "${TRACE0}" +touch "${TRACE0}" + +echo "Split empty trace file" +run_no_sort "\"${SPLIT_TRACE}\" \"${TRACE0}\" \"${DELIM}\" \"${SPLIT_COUNT}\" \"${SPLIT_PREFIX}\" || true" + +found=$(find "${SPLITS[@]}" 2> /dev/null | wc -l) +echo "Expecting 0 trace files. Found ${found}." +echo +# ################################# + +# generate the tree +@CMAKE_CURRENT_BINARY_DIR@/generatetree.sh "${SRCDIR}" + +# generate a single trace +run_no_sort "${GUFI_DIR2TRACE} -d \"${DELIM}\" -n 1 -x \"${SRCDIR}\" \"${TRACE}\"" + +# split trace +run_no_sort "${SPLIT_TRACE} \"${TRACE0}\" \"${DELIM}\" ${SPLIT_COUNT} \"${SPLIT_PREFIX}\"" | replace + +# there might be fewer files than requested +splits=() +while IFS= read -r -d $'\0' +do + splits+=("${REPLY}") +done < <(find "${SPLITS[@]}" -print0 || true) + +# combine files +for split in "${splits[@]}" +do + cat "${split}" +done > "${SPLIT_PREFIX}" + +# make sure there are no differences +run_no_sort "@DIFF@ \"${TRACE0}\" \"${SPLIT_PREFIX}\"" + +echo "# Split non-existant trace" +run_no_sort "${SPLIT_TRACE} \"${BADTRACE}\" \"${DELIM}\" ${SPLIT_COUNT} \"${SPLIT_PREFIX}\" || true" | replace +) | remove_indexing_time | tee "${OUTPUT}" + +@DIFF@ @CMAKE_CURRENT_BINARY_DIR@/split_trace.expected "${OUTPUT}" +rm "${OUTPUT}"