diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..2c95785f --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,103 @@ +name: Build and Release (on GitHub only) + +on: + # when a push is made to the main branch (like when a pull request is merged, or something is pushed directly) + workflow_dispatch: + push: + branches: [ "main" ] + +env: + BUILD_TYPE: Release + +jobs: + + set-outputs: + runs-on: ubuntu-latest + outputs: + short_sha: ${{ steps.vars.outputs.short_sha }} + steps: + + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Calculate short_sha + id: vars + run: echo "short_sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + + build-and-upload: + + runs-on: ubuntu-latest + needs: set-outputs + container: + image: gocartio/cartogram-web:latest + steps: + + - name: Install Dependencies + run: | + apt update -y + apt install -y git g++-11 build-essential cmake libboost-all-dev + + - name: Checkout Repository + uses: actions/checkout@v4 + with: + submodules: 'recursive' + fetch-depth: 0 + + - name: Configure CMake + run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DRELEASE_TAG=${{ needs.set-outputs.outputs.short_sha }} + + - name: Build + run: | + cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} --target install -j4 -- + + # - name: Run CTest + # working-directory: ${{github.workspace}}/build + # # Execute tests defined by the CMake configuration. 
+ # # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail + # run: ctest -C ${{env.BUILD_TYPE}} + + # - name: Run Stress Test + # run: | + # sudo make install -C build + # cd tests/ + # chmod +x stress_test.sh + # bash stress_test.sh + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cartogram + path: /usr/local/bin/cartogram + + release: + + runs-on: ubuntu-latest + needs: [build-and-upload, set-outputs] + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Download Artifact + uses: actions/download-artifact@v4 + with: + name: cartogram + + - name: Push tag + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git tag ${{ needs.set-outputs.outputs.short_sha }} + git push origin ${{ needs.set-outputs.outputs.short_sha }} + + - name: Release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ needs.set-outputs.outputs.short_sha }} + files: cartogram + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 7f46a652..82e5392b 100644 --- a/.gitignore +++ b/.gitignore @@ -9,9 +9,6 @@ cartogram /*.geojson /sample_data/*.geojson -# Ignore files generated by Visual Studio Code -.vscode - # Ignore DS_Store files created by macOS **/.DS_Store **/.cache* diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..00639f9e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "external/cgal"] + path = external/cgal + url = https://github.com/CGAL/cgal.git diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..7cbcc373 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,78 @@ +{ + "files.associations": { + "__config": "cpp", + "__verbose_abort": "cpp", + "array": "cpp", + "cstddef": "cpp", + 
"cstdint": "cpp", + "cstdlib": "cpp", + "initializer_list": "cpp", + "limits": "cpp", + "numbers": "cpp", + "concepts": "cpp", + "algorithm": "cpp", + "type_traits": "cpp", + "__hash_table": "cpp", + "__split_buffer": "cpp", + "__tree": "cpp", + "atomic": "cpp", + "deque": "cpp", + "hash_map": "cpp", + "forward_list": "cpp", + "ios": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "system_error": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "*.tcc": "cpp", + "any": "cpp", + "cmath": "cpp", + "cstdio": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "bit": "cpp", + "cctype": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "compare": "cpp", + "exception": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "tuple": "cpp", + "utility": "cpp", + "format": "cpp", + "fstream": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "new": "cpp", + "ostream": "cpp", + "span": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "__bit_reference": "cpp", + "__threading_support": "cpp", + "execution": "cpp" + }, + "files.exclude": { + "**/external": true + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index bb37aa11..d01244f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,9 @@ -cmake_minimum_required(VERSION 3.27) +cmake_minimum_required(VERSION 3.25) + +if(UNIX AND NOT APPLE) + set(CMAKE_CXX_COMPILER "g++-11") +endif() + project(cartogram LANGUAGES CXX) # ========== Project Setup ========== @@ -7,6 +12,9 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) set(CMAKE_COLOR_DIAGNOSTICS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +# Assume development build by 
default +set(RELEASE_TAG "development" CACHE STRING "Release tag for the build") + # Default build type if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) @@ -14,45 +22,51 @@ endif() # ========== Dependencies Setup ========== +# Direct CMake to local CGAL installation +set(CGAL_DIR ${PROJECT_SOURCE_DIR}/external/cgal) +find_package(CGAL REQUIRED) + # Boost find_package(Boost REQUIRED COMPONENTS unit_test_framework) -# Matplot++ -find_package(Matplot++ REQUIRED) - # PkgConfig, fftw, and cairo find_package(PkgConfig REQUIRED) pkg_search_module(fftw REQUIRED fftw3 IMPORTED_TARGET) -pkg_search_module(CAIRO REQUIRED CAIRO IMPORTED_TARGET) +pkg_search_module(cairo REQUIRED cairo IMPORTED_TARGET) # ========== Source Files ========== file(GLOB_RECURSE CARTOGRAM_SOURCES "src/*.cpp") add_executable(cartogram ${CARTOGRAM_SOURCES}) +target_compile_definitions(cartogram PRIVATE RELEASE_TAG="${RELEASE_TAG}") + # ========== Include Directories ========== -target_include_directories(cartogram PUBLIC - ${PROJECT_SOURCE_DIR}/include - ${Boost_INCLUDE_DIRS} - PkgConfig::fftw - PkgConfig::CAIRO +target_include_directories(cartogram + PUBLIC + ${PROJECT_SOURCE_DIR}/include +) + +target_include_directories(cartogram + SYSTEM PUBLIC + ${CGAL_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS} + ${PROJECT_SOURCE_DIR}/external ) # ========== Compile Options ========== -if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - target_compile_options(cartogram PRIVATE -isystem ${Boost_INCLUDE_DIRS}) - target_compile_options(cartogram PRIVATE -ffp-contract=off) -elseif(MSVC) - target_compile_options(cartogram PRIVATE /external:I ${Boost_INCLUDE_DIRS}) -endif() +target_compile_options(cartogram PRIVATE -ffp-contract=off) # Compiler warnings -target_compile_options(cartogram PRIVATE -Wall -Wextra -pedantic -Wno-deprecated-declarations) +target_compile_options(cartogram PRIVATE + -Wall # Enable all warnings + -Wextra # Enable extra warnings + -Wpedantic # Enable 
pedantic warnings +) # ========== Linking Libraries ========== target_link_libraries(cartogram - PkgConfig::fftw - PkgConfig::CAIRO - Matplot++::matplot + PkgConfig::fftw + PkgConfig::cairo ) # ========== Installation ========== @@ -80,18 +94,18 @@ foreach(TEST_FILE ${TEST_FILES}) add_executable(${TEST_NAME} ${TEST_FILE} ${CARTOGRAM_TEST_SOURCES_FROM_SRC}) # Include directories for the test executable - target_include_directories(${TEST_NAME} PUBLIC - ${PROJECT_SOURCE_DIR}/include - ${Boost_INCLUDE_DIRS} - PkgConfig::fftw + target_include_directories(${TEST_NAME} + PUBLIC + ${PROJECT_SOURCE_DIR}/include + ) + target_include_directories(${TEST_NAME} + SYSTEM PUBLIC + ${CGAL_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS} + ${PROJECT_SOURCE_DIR}/external ) - if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - target_compile_options(${TEST_NAME} PRIVATE -isystem ${Boost_INCLUDE_DIRS}) - target_compile_options(${TEST_NAME} PRIVATE -ffp-contract=off) - elseif(MSVC) - target_compile_options(${TEST_NAME} PRIVATE /external:I ${Boost_INCLUDE_DIRS}) - endif() + target_compile_options(${TEST_NAME} PRIVATE -ffp-contract=off) # Compiler warnings for the test executable target_compile_options(${TEST_NAME} PRIVATE -Wall -Wextra -pedantic -Wno-deprecated-declarations) @@ -113,4 +127,4 @@ add_custom_command( POST_BUILD COMMENT "Uninstalling cartogram..." COMMAND xargs rm -vf < install_manifest.txt || echo "Nothing in install_manifest.txt to be uninstalled!" -) +) \ No newline at end of file diff --git a/README.md b/README.md index cf31a6b3..f8238621 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,14 @@ Gastner MT, Seguy V, More P. _Fast flow-based algorithm for creating density-equ Data produced by code in this repository are subject to the MIT license found [here](./LICENSE) and should cite the aforementioned paper by Gastner et al. (2018). 
+When cloning this repository, please make sure to use the `--recurse-submodules` flag, like so: +- + git clone --recurse-submodules https://github.com/mgastner/cartogram-cpp.git + ## Dependencies +Please note that we only support UNIX-based systems and have only tested on macOS and GNU/Linux. + ### macOS #### Installing Homebrew @@ -22,25 +28,17 @@ Install [homebrew](brew.sh) by running the following command: #### Installing dependencies through Homebrew -Install llvm, pkg-config, boost, fftw, cgal, nlohmann-json, and cmake by running the following command: +Install libomp, pkg-config, boost, fftw, nlohmann-json, cmake, and cairo by running the following command: - brew install llvm@17 libomp pkg-config boost fftw cgal nlohmann-json cmake cairo matplotplusplus + brew install libomp pkg-config boost fftw nlohmann-json cmake cairo ### Debian-based distributions (Ubuntu, Arch Linux etc.) -#### Installing GNU g++-13 - -Run the following commands to install it: - - sudo apt install build-essential manpages-dev software-properties-common - sudo add-apt-repository ppa:ubuntu-toolchain-r/test - sudo apt update && sudo apt install gcc-13 g++-13 - -#### Installing dependencies through apt +#### Installing relevant dependencies through apt: -Install nlohmann-json, cgal, openmp, fftw3, cairo, matplot++, boost, and cmake by running the following command: +Have a look at `apt-requirements.txt` if you'd like to see everything that will be installed. Then, run the following command to install all dependencies through apt: - sudo apt install nlohmann-json3-dev libcgal-dev libomp-dev libfftw3-dev libcairo2-dev libmatplot++-dev libboost-all-dev cmake + apt install -y g++-11 build-essential cmake libboost-all-dev nlohmann-json3-dev libomp-dev libfftw3-dev libcairo2-dev ### Installation @@ -62,6 +60,7 @@ Using lesser cores than you have is recommended so that your computer still has - If running `cmake -B build` gives you an error, it is likely that a dependency was not installed correctly. 
Rerun the appropriate commands above to install the required dependencies and try again. - If you get an error which mentions permission issues, try running the command that gave you the error with `sudo` prefixed, as done with `sudo make install -C build` above. - If `cmake` complains that it could not find a particular library, please try uninstalling it and installing it again. After reinstalling it, please also unlink it and link it with the `--force` flag. +- If you get errors related to CGAL, it's likely you have another version of CGAL installed on your computer that is getting chosen instead of the one contained as a submodule within this repository. It's also possible that when cloning this repository, the `--recurse-submodules` flag was missing. Try running `git submodule init` and `git submodule update` in the root directory of the repository. ### Usage diff --git a/include/argparse.hpp b/external/argparse.hpp similarity index 100% rename from include/argparse.hpp rename to external/argparse.hpp diff --git a/external/cgal b/external/cgal new file mode 160000 index 00000000..188e51ba --- /dev/null +++ b/external/cgal @@ -0,0 +1 @@ +Subproject commit 188e51bad36ffc30e49dbabda29620b71a84664c diff --git a/src/cartogram_info/csv.hpp b/external/csv.hpp similarity index 80% rename from src/cartogram_info/csv.hpp rename to external/csv.hpp index b5415978..811c8e14 100644 --- a/src/cartogram_info/csv.hpp +++ b/external/csv.hpp @@ -1,11 +1,11 @@ #pragma once /* -CSV for C++, version 2.0.0, beta +CSV for C++, version 2.3.0 https://github.com/vincentlaucsb/csv-parser MIT License -Copyright (c) 2017-2020 Vincent La +Copyright (c) 2017-2024 Vincent La Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -34,12 +34,14 @@ SOFTWARE. 
*/ +#include #include +#include #include #include -#include #include -#include +#include +#include #include #include @@ -410,7 +412,7 @@ struct basic_mmap * handle (which is closed when the object destructs or `unmap` is called), which is * then used to memory map the requested region. Upon failure, `error` is set to * indicate the reason and the object remains in an unmapped state. - * + * * The entire file is mapped. */ template @@ -449,7 +451,7 @@ struct basic_mmap * `handle`, which must be a valid file handle, which is used to memory map the * requested region. Upon failure, `error` is set to indicate the reason and the * object remains in an unmapped state. - * + * * The entire file is mapped. */ void map(const handle_type handle, std::error_code& error) @@ -1795,101 +1797,54 @@ using shared_ummap_sink = basic_shared_mmap_sink; #endif // MIO_SHARED_MMAP_HEADER /** @file - * Defines CSV global constants + * @brief Contains the main CSV parsing algorithm and various utility functions */ -#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include -#if defined(_WIN32) -#include -#define WIN32_LEAN_AND_MEAN -#undef max -#undef min -#elif defined(__linux__) -#include -#endif - -namespace csv { - namespace internals { - // PAGE_SIZE macro could be already defined by the host system. 
- #if defined(PAGE_SIZE) - #undef PAGE_SIZE - #endif - - // Get operating system specific details - #if defined(_WIN32) - inline int getpagesize() { - _SYSTEM_INFO sys_info = {}; - GetSystemInfo(&sys_info); - return sys_info.dwPageSize; - } - - /** Size of a memory page in bytes */ - const int PAGE_SIZE = getpagesize(); - - /** Returns the amount of available mmory */ - inline unsigned long long get_available_memory() - { - MEMORYSTATUSEX status; - status.dwLength = sizeof(status); - GlobalMemoryStatusEx(&status); - return status.ullAvailPhys; - } - #elif defined(__linux__) - // To be defined - inline unsigned long long get_available_memory() { - return 0; - } - - const int PAGE_SIZE = getpagesize(); - #else - // To be defined - inline unsigned long long get_available_memory() { - return 0; - } - - const int PAGE_SIZE = 4096; - #endif - - /** For functions that lazy load a large CSV, this determines how - * many bytes are read at a time - */ - constexpr size_t ITERATION_CHUNK_SIZE = 250000000; // 250MB - - // TODO: Move to another header file - template - inline bool is_equal(T a, T b, T epsilon = 0.001) { - /** Returns true if two floating point values are about the same */ - static_assert(std::is_floating_point::value, "T must be a floating point type."); - return std::abs(a - b) < epsilon; - } - } - - /** Integer indicating a requested column wasn't found. 
*/ - constexpr int CSV_NOT_FOUND = -1; +#include +#include +#include +#include - /** Used for counting number of rows */ - using RowCount = long long int; -} /** @file - * @brief Implements data type parsing functionality + * A standalone header file containing shared code */ +#include +#include #include -#include -#include -#include +#include +#include -/** @file - * Defines various compatibility macros - */ +#if defined(_WIN32) +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include +# undef max +# undef min +#elif defined(__linux__) +# include +#endif -/** Helper macro which should be #defined as "inline" - * in the single header version - */ + /** Helper macro which should be #defined as "inline" + * in the single header version + */ #define CSV_INLINE inline +#include + // Copyright 2017-2019 by Martin Moene // // string-view lite, a C++17-like string_view for C++98 and later. @@ -3211,9 +3166,9 @@ nssv_RESTORE_WARNINGS() #endif // NONSTD_SV_LITE_H_INCLUDED -// If there is another version of Hedley, then the newer one -// takes precedence. -// See: https://github.com/nemequ/hedley + // If there is another version of Hedley, then the newer one + // takes precedence. + // See: https://github.com/nemequ/hedley /* Hedley - https://nemequ.github.io/hedley * Created by Evan Nemerson * @@ -4722,6 +4677,9 @@ HEDLEY_DIAGNOSTIC_POP namespace csv { +#ifdef _MSC_VER +#pragma region Compatibility Macros +#endif /** * @def IF_CONSTEXPR * Expands to `if constexpr` in C++17 and `if` otherwise @@ -4731,396 +4689,213 @@ namespace csv { * Mainly used for global variables. * * @def CONSTEXPR - * Expands to `constexpr` in C++17 and `inline` otherwise. + * Expands to `constexpr` in decent compilers and `inline` otherwise. * Intended for functions and methods. 
*/ - #if CMAKE_CXX_STANDARD == 17 || __cplusplus >= 201703L - #define CSV_HAS_CXX17 - #endif +#define STATIC_ASSERT(x) static_assert(x, "Assertion failed") - #ifdef CSV_HAS_CXX17 - #include - /** @typedef string_view - * The string_view class used by this library. - */ - using string_view = std::string_view; - #else - /** @typedef string_view - * The string_view class used by this library. - */ - using string_view = nonstd::string_view; - #endif +#if CMAKE_CXX_STANDARD == 17 || __cplusplus >= 201703L +#define CSV_HAS_CXX17 +#endif - #ifdef CSV_HAS_CXX17 - #define IF_CONSTEXPR if constexpr - #define CONSTEXPR_VALUE constexpr - #else - #define IF_CONSTEXPR if - #define CONSTEXPR_VALUE const - #endif +#if CMAKE_CXX_STANDARD >= 14 || __cplusplus >= 201402L +#define CSV_HAS_CXX14 +#endif + +#ifdef CSV_HAS_CXX17 +#include + /** @typedef string_view + * The string_view class used by this library. + */ + using string_view = std::string_view; +#else + /** @typedef string_view + * The string_view class used by this library. 
+ */ + using string_view = nonstd::string_view; +#endif + +#ifdef CSV_HAS_CXX17 + #define IF_CONSTEXPR if constexpr + #define CONSTEXPR_VALUE constexpr + + #define CONSTEXPR_17 constexpr +#else + #define IF_CONSTEXPR if + #define CONSTEXPR_VALUE const + + #define CONSTEXPR_17 inline +#endif + +#ifdef CSV_HAS_CXX14 + template + using enable_if_t = std::enable_if_t; + + #define CONSTEXPR_14 constexpr + #define CONSTEXPR_VALUE_14 constexpr +#else + template + using enable_if_t = typename std::enable_if::type; + + #define CONSTEXPR_14 inline + #define CONSTEXPR_VALUE_14 const +#endif // Resolves g++ bug with regard to constexpr methods - #if defined __GNUC__ && !defined __clang__ - #if __GNUC__ >= 7 - #if defined(CSV_HAS_CXX17) && (__GNUC_MINOR__ >= 2 || __GNUC__ >= 8) - #define CONSTEXPR constexpr - #endif - #endif + // See: https://stackoverflow.com/questions/36489369/constexpr-non-static-member-function-with-non-constexpr-constructor-gcc-clang-d +#if defined __GNUC__ && !defined __clang__ + #if (__GNUC__ >= 7 &&__GNUC_MINOR__ >= 2) || (__GNUC__ >= 8) + #define CONSTEXPR constexpr + #endif #else #ifdef CSV_HAS_CXX17 - #define CONSTEXPR constexpr - #endif - #endif - - #ifndef CONSTEXPR - #define CONSTEXPR inline + #define CONSTEXPR constexpr #endif -} - +#endif -namespace csv { - /** Enumerates the different CSV field types that are - * recognized by this library - * - * @note Overflowing integers will be stored and classified as doubles. - * @note Unlike previous releases, integer enums here are platform agnostic. 
- */ - enum class DataType { - UNKNOWN = -1, - CSV_NULL, /**< Empty string */ - CSV_STRING, /**< Non-numeric string */ - CSV_INT8, /**< 8-bit integer */ - CSV_INT16, /**< 16-bit integer (short on MSVC/GCC) */ - CSV_INT32, /**< 32-bit integer (int on MSVC/GCC) */ - CSV_INT64, /**< 64-bit integer (long long on MSVC/GCC) */ - CSV_DOUBLE /**< Floating point value */ - }; +#ifndef CONSTEXPR +#define CONSTEXPR inline +#endif - static_assert(DataType::CSV_STRING < DataType::CSV_INT8, "String type should come before numeric types."); - static_assert(DataType::CSV_INT8 < DataType::CSV_INT64, "Smaller integer types should come before larger integer types."); - static_assert(DataType::CSV_INT64 < DataType::CSV_DOUBLE, "Integer types should come before floating point value types."); +#ifdef _MSC_VER +#pragma endregion +#endif namespace internals { - /** Compute 10 to the power of n */ - template - HEDLEY_CONST CONSTEXPR - long double pow10(const T& n) noexcept { - long double multiplicand = n > 0 ? 10 : 0.1, - ret = 1; - - // Make all numbers positive - T iterations = n > 0 ? n : -n; - - for (T i = 0; i < iterations; i++) { - ret *= multiplicand; - } + // PAGE_SIZE macro could be already defined by the host system. +#if defined(PAGE_SIZE) +#undef PAGE_SIZE +#endif - return ret; +// Get operating system specific details +#if defined(_WIN32) + inline int getpagesize() { + _SYSTEM_INFO sys_info = {}; + GetSystemInfo(&sys_info); + return std::max(sys_info.dwPageSize, sys_info.dwAllocationGranularity); } - /** Compute 10 to the power of n */ - template<> - HEDLEY_CONST CONSTEXPR - long double pow10(const unsigned& n) noexcept { - long double multiplicand = n > 0 ? 10 : 0.1, - ret = 1; + const int PAGE_SIZE = getpagesize(); +#elif defined(__linux__) + const int PAGE_SIZE = getpagesize(); +#else + /** Size of a memory page in bytes. Used by + * csv::internals::CSVFieldArray when allocating blocks. 
+ */ + const int PAGE_SIZE = 4096; +#endif - for (unsigned i = 0; i < n; i++) { - ret *= multiplicand; - } + /** For functions that lazy load a large CSV, this determines how + * many bytes are read at a time + */ + constexpr size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB - return ret; + template + inline bool is_equal(T a, T b, T epsilon = 0.001) { + /** Returns true if two floating point values are about the same */ + static_assert(std::is_floating_point::value, "T must be a floating point type."); + return std::abs(a - b) < epsilon; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS - /** Private site-indexed array mapping byte sizes to an integer size enum */ - constexpr DataType int_type_arr[8] = { - DataType::CSV_INT8, // 1 - DataType::CSV_INT16, // 2 - DataType::UNKNOWN, - DataType::CSV_INT32, // 4 - DataType::UNKNOWN, - DataType::UNKNOWN, - DataType::UNKNOWN, - DataType::CSV_INT64 // 8 + /** @typedef ParseFlags + * An enum used for describing the significance of each character + * with respect to CSV parsing + * + * @see quote_escape_flag + */ + enum class ParseFlags { + QUOTE_ESCAPE_QUOTE = 0, /**< A quote inside or terminating a quote_escaped field */ + QUOTE = 2 | 1, /**< Characters which may signify a quote escape */ + NOT_SPECIAL = 4, /**< Characters with no special meaning or escaped delimiters and newlines */ + DELIMITER = 4 | 2, /**< Characters which signify a new field */ + NEWLINE = 4 | 2 | 1 /**< Characters which signify a new row */ }; - template - inline DataType type_num() { - static_assert(std::is_integral::value, "T should be an integral type."); - static_assert(sizeof(T) <= 8, "Byte size must be no greater than 8."); - return int_type_arr[sizeof(T) - 1]; + /** Transform the ParseFlags given the context of whether or not the current + * field is quote escaped */ + constexpr ParseFlags quote_escape_flag(ParseFlags flag, bool quote_escape) noexcept { + return (ParseFlags)((int)flag & ~((int)ParseFlags::QUOTE * quote_escape)); } - template<> inline DataType 
type_num() { return DataType::CSV_DOUBLE; } - template<> inline DataType type_num() { return DataType::CSV_DOUBLE; } - template<> inline DataType type_num() { return DataType::CSV_DOUBLE; } - template<> inline DataType type_num() { return DataType::CSV_NULL; } - template<> inline DataType type_num() { return DataType::CSV_STRING; } - - CONSTEXPR DataType data_type(csv::string_view in, long double* const out = nullptr); -#endif + // Assumed to be true by parsing functions: allows for testing + // if an item is DELIMITER or NEWLINE with a >= statement + STATIC_ASSERT(ParseFlags::DELIMITER < ParseFlags::NEWLINE); - /** Given a byte size, return the largest number than can be stored in - * an integer of that size + /** Optimizations for reducing branching in parsing loop * - * Note: Provides a platform-agnostic way of mapping names like "long int" to - * byte sizes + * Idea: The meaning of all non-quote characters changes depending + * on whether or not the parser is in a quote-escaped mode (0 or 1) */ - template - CONSTEXPR long double get_int_max() { - static_assert(Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8, - "Bytes must be a power of 2 below 8."); + STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, false) == ParseFlags::NOT_SPECIAL); + STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, false) == ParseFlags::QUOTE); + STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, false) == ParseFlags::DELIMITER); + STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, false) == ParseFlags::NEWLINE); - IF_CONSTEXPR (sizeof(signed char) == Bytes) { - return (long double)std::numeric_limits::max(); - } + STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, true) == ParseFlags::NOT_SPECIAL); + STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, true) == ParseFlags::QUOTE_ESCAPE_QUOTE); + STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, true) == ParseFlags::NOT_SPECIAL); + STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, true) == 
ParseFlags::NOT_SPECIAL); - IF_CONSTEXPR (sizeof(short) == Bytes) { - return (long double)std::numeric_limits::max(); - } + /** An array which maps ASCII chars to a parsing flag */ + using ParseFlagMap = std::array; - IF_CONSTEXPR (sizeof(int) == Bytes) { - return (long double)std::numeric_limits::max(); - } + /** An array which maps ASCII chars to a flag indicating if it is whitespace */ + using WhitespaceMap = std::array; + } - IF_CONSTEXPR (sizeof(long int) == Bytes) { - return (long double)std::numeric_limits::max(); - } + /** Integer indicating a requested column wasn't found. */ + constexpr int CSV_NOT_FOUND = -1; +} - IF_CONSTEXPR (sizeof(long long int) == Bytes) { - return (long double)std::numeric_limits::max(); - } - HEDLEY_UNREACHABLE(); - } +namespace csv { + namespace internals { + struct ColNames; + using ColNamesPtr = std::shared_ptr; - /** Given a byte size, return the largest number than can be stored in - * an unsigned integer of that size - */ - template - CONSTEXPR long double get_uint_max() { - static_assert(Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8, - "Bytes must be a power of 2 below 8."); + /** @struct ColNames + * A data structure for handling column name information. + * + * These are created by CSVReader and passed (via smart pointer) + * to CSVRow objects it creates, thus + * allowing for indexing by column name. 
+ */ + struct ColNames { + public: + ColNames() = default; + ColNames(const std::vector& names) { + set_col_names(names); + } - IF_CONSTEXPR(sizeof(unsigned char) == Bytes) { - return (long double)std::numeric_limits::max(); - } - - IF_CONSTEXPR(sizeof(unsigned short) == Bytes) { - return (long double)std::numeric_limits::max(); - } - - IF_CONSTEXPR(sizeof(unsigned int) == Bytes) { - return (long double)std::numeric_limits::max(); - } - - IF_CONSTEXPR(sizeof(unsigned long int) == Bytes) { - return (long double)std::numeric_limits::max(); - } - - IF_CONSTEXPR(sizeof(unsigned long long int) == Bytes) { - return (long double)std::numeric_limits::max(); - } - - HEDLEY_UNREACHABLE(); - } - - /** Largest number that can be stored in a 8-bit integer */ - CONSTEXPR_VALUE long double CSV_INT8_MAX = get_int_max<1>(); - - /** Largest number that can be stored in a 16-bit integer */ - CONSTEXPR_VALUE long double CSV_INT16_MAX = get_int_max<2>(); - - /** Largest number that can be stored in a 32-bit integer */ - CONSTEXPR_VALUE long double CSV_INT32_MAX = get_int_max<4>(); - - /** Largest number that can be stored in a 64-bit integer */ - CONSTEXPR_VALUE long double CSV_INT64_MAX = get_int_max<8>(); - - /** Largest number that can be stored in a 8-bit ungisned integer */ - CONSTEXPR_VALUE long double CSV_UINT8_MAX = get_uint_max<1>(); - - /** Largest number that can be stored in a 16-bit unsigned integer */ - CONSTEXPR_VALUE long double CSV_UINT16_MAX = get_uint_max<2>(); - - /** Largest number that can be stored in a 32-bit unsigned integer */ - CONSTEXPR_VALUE long double CSV_UINT32_MAX = get_uint_max<4>(); - - /** Largest number that can be stored in a 64-bit unsigned integer */ - CONSTEXPR_VALUE long double CSV_UINT64_MAX = get_uint_max<8>(); - - /** Given a pointer to the start of what is start of - * the exponential part of a number written (possibly) in scientific notation - * parse the exponent - */ - HEDLEY_PRIVATE CONSTEXPR - DataType _process_potential_exponential( - 
csv::string_view exponential_part, - const long double& coeff, - long double * const out) { - long double exponent = 0; - auto result = data_type(exponential_part, &exponent); - - // Exponents in scientific notation should not be decimal numbers - if (result >= DataType::CSV_INT8 && result < DataType::CSV_DOUBLE) { - if (out) *out = coeff * pow10(exponent); - return DataType::CSV_DOUBLE; - } - - return DataType::CSV_STRING; - } - - /** Given the absolute value of an integer, determine what numeric type - * it fits in - */ - HEDLEY_PRIVATE HEDLEY_PURE CONSTEXPR - DataType _determine_integral_type(const long double& number) noexcept { - // We can assume number is always non-negative - assert(number >= 0); - - if (number <= internals::CSV_INT8_MAX) - return DataType::CSV_INT8; - else if (number <= internals::CSV_INT16_MAX) - return DataType::CSV_INT16; - else if (number <= internals::CSV_INT32_MAX) - return DataType::CSV_INT32; - else if (number <= internals::CSV_INT64_MAX) - return DataType::CSV_INT64; - else // Conversion to long long will cause an overflow - return DataType::CSV_DOUBLE; - } - - /** Distinguishes numeric from other text values. 
Used by various - * type casting functions, like csv_parser::CSVReader::read_row() - * - * #### Rules - * - Leading and trailing whitespace ("padding") ignored - * - A string of just whitespace is NULL - * - * @param[in] in String value to be examined - * @param[out] out Pointer to long double where results of numeric parsing - * get stored - */ - CONSTEXPR - DataType data_type(csv::string_view in, long double* const out) { - // Empty string --> NULL - if (in.size() == 0) - return DataType::CSV_NULL; - - bool ws_allowed = true, - neg_allowed = true, - dot_allowed = true, - digit_allowed = true, - has_digit = false, - prob_float = false; - - unsigned places_after_decimal = 0; - long double integral_part = 0, - decimal_part = 0; - - for (size_t i = 0, ilen = in.size(); i < ilen; i++) { - const char& current = in[i]; - - switch (current) { - case ' ': - if (!ws_allowed) { - if (isdigit(in[i - 1])) { - digit_allowed = false; - ws_allowed = true; - } - else { - // Ex: '510 123 4567' - return DataType::CSV_STRING; - } - } - break; - case '-': - if (!neg_allowed) { - // Ex: '510-123-4567' - return DataType::CSV_STRING; - } - - neg_allowed = false; - break; - case '.': - if (!dot_allowed) { - return DataType::CSV_STRING; - } - - dot_allowed = false; - prob_float = true; - break; - case 'e': - case 'E': - // Process scientific notation - if (prob_float || (i && i + 1 < ilen && isdigit(in[i - 1]))) { - size_t exponent_start_idx = i + 1; - prob_float = true; - - // Strip out plus sign - if (in[i + 1] == '+') { - exponent_start_idx++; - } - - return _process_potential_exponential( - in.substr(exponent_start_idx), - neg_allowed ? 
integral_part + decimal_part : -(integral_part + decimal_part), - out - ); - } - - return DataType::CSV_STRING; - break; - default: - short digit = current - '0'; - if (digit >= 0 && digit <= 9) { - // Process digit - has_digit = true; - - if (!digit_allowed) - return DataType::CSV_STRING; - else if (ws_allowed) // Ex: '510 456' - ws_allowed = false; - - // Build current number - if (prob_float) - decimal_part += digit / pow10(++places_after_decimal); - else - integral_part = (integral_part * 10) + digit; - } - else { - return DataType::CSV_STRING; - } - } - } - - // No non-numeric/non-whitespace characters found - if (has_digit) { - long double number = integral_part + decimal_part; - if (out) { - *out = neg_allowed ? number : -number; - } + std::vector get_col_names() const; + void set_col_names(const std::vector&); + int index_of(csv::string_view) const; - return prob_float ? DataType::CSV_DOUBLE : _determine_integral_type(number); - } + bool empty() const noexcept { return this->col_names.empty(); } + size_t size() const noexcept; - // Just whitespace - return DataType::CSV_NULL; - } + private: + std::vector col_names; + std::unordered_map col_pos; + }; } } /** @file * Defines an object used to store CSV format settings */ +#include #include #include #include namespace csv { + namespace internals { + class IBasicCSVParser; + } + class CSVReader; /** Determines how to handle rows that are shorter or longer than the majority */ @@ -5137,7 +4912,7 @@ namespace csv { }; /** Stores information about how to parse a CSV file. - * Can be used to construct a csv::CSVReader. + * Can be used to construct a csv::CSVReader. 
*/ class CSVFormat { public: @@ -5151,7 +4926,7 @@ namespace csv { CSVFormat& delimiter(char delim); /** Sets a list of potential delimiters - * + * * @throws `std::runtime_error` thrown if trim, quote, or possible delimiting characters overlap * @param[in] delim An array of possible delimiters to try parsing the CSV with */ @@ -5182,6 +4957,16 @@ namespace csv { */ CSVFormat& header_row(int row); + /** Tells the parser that this CSV has no header row + * + * @note Equivalent to `header_row(-1)` + * + */ + CSVFormat& no_header() { + this->header_row(-1); + return *this; + } + /** Turn quoting on or off */ CSVFormat& quote(bool use_quote) { this->no_quote = !use_quote; @@ -5189,23 +4974,17 @@ namespace csv { } /** Tells the parser how to handle columns of a different length than the others */ - CONSTEXPR CSVFormat& variable_columns(VariableColumnPolicy policy = VariableColumnPolicy::IGNORE_ROW) { + CONSTEXPR_14 CSVFormat& variable_columns(VariableColumnPolicy policy = VariableColumnPolicy::IGNORE_ROW) { this->variable_column_policy = policy; return *this; } /** Tells the parser how to handle columns of a different length than the others */ - CONSTEXPR CSVFormat& variable_columns(bool policy) { + CONSTEXPR_14 CSVFormat& variable_columns(bool policy) { this->variable_column_policy = (VariableColumnPolicy)policy; return *this; } - /** Tells the parser to detect and remove UTF-8 byte order marks */ - CONSTEXPR CSVFormat& detect_bom(bool detect = true) { - this->unicode_detect = detect; - return *this; - } - #ifndef DOXYGEN_SHOULD_SKIP_THIS char get_delim() const { // This error should never be received by end users. 
@@ -5223,14 +5002,13 @@ namespace csv { std::vector get_trim_chars() const { return this->trim_chars; } CONSTEXPR VariableColumnPolicy get_variable_column_policy() const { return this->variable_column_policy; } #endif - + /** CSVFormat for guessing the delimiter */ CSV_INLINE static CSVFormat guess_csv() { CSVFormat format; format.delimiter({ ',', '|', '\t', ';', '^' }) .quote('"') - .header_row(0) - .detect_bom(true); + .header_row(0); return format; } @@ -5240,7 +5018,8 @@ namespace csv { } friend CSVReader; - + friend internals::IBasicCSVParser; + private: /**< Throws an error if delimiters and trim characters overlap */ void assert_no_char_overlap(); @@ -5265,1082 +5044,2166 @@ namespace csv { /**< Allow variable length columns? */ VariableColumnPolicy variable_column_policy = VariableColumnPolicy::IGNORE_ROW; - - /**< Detect and strip out Unicode byte order marks */ - bool unicode_detect = true; }; } -#include -#include -#include -#include -#include -#include +/** @file + * Defines the data type used for storing information about a CSV row + */ -#include +#include #include -#include +#include +#include // For CSVField +#include // For CSVField #include #include +#include +#include #include -#include -#include +/** @file + * @brief Implements data type parsing functionality + */ + +#include +#include #include -#include +#include namespace csv { - namespace internals { - struct ColNames; - using ColNamesPtr = std::shared_ptr; + /** Enumerates the different CSV field types that are + * recognized by this library + * + * @note Overflowing integers will be stored and classified as doubles. + * @note Unlike previous releases, integer enums here are platform agnostic. 
+ */ + enum class DataType { + UNKNOWN = -1, + CSV_NULL, /**< Empty string */ + CSV_STRING, /**< Non-numeric string */ + CSV_INT8, /**< 8-bit integer */ + CSV_INT16, /**< 16-bit integer (short on MSVC/GCC) */ + CSV_INT32, /**< 32-bit integer (int on MSVC/GCC) */ + CSV_INT64, /**< 64-bit integer (long long on MSVC/GCC) */ + CSV_BIGINT, /**< Value too big to fit in a 64-bit in */ + CSV_DOUBLE /**< Floating point value */ + }; - /** @struct ColNames - * A data structure for handling column name information. - * - * These are created by CSVReader and passed (via smart pointer) - * to CSVRow objects it creates, thus - * allowing for indexing by column name. + static_assert(DataType::CSV_STRING < DataType::CSV_INT8, "String type should come before numeric types."); + static_assert(DataType::CSV_INT8 < DataType::CSV_INT64, "Smaller integer types should come before larger integer types."); + static_assert(DataType::CSV_INT64 < DataType::CSV_DOUBLE, "Integer types should come before floating point value types."); + + namespace internals { + /** Compute 10 to the power of n */ + template + HEDLEY_CONST CONSTEXPR_14 + long double pow10(const T& n) noexcept { + long double multiplicand = n > 0 ? 10 : 0.1, + ret = 1; + + // Make all numbers positive + T iterations = n > 0 ? n : -n; + + for (T i = 0; i < iterations; i++) { + ret *= multiplicand; + } + + return ret; + } + + /** Compute 10 to the power of n */ + template<> + HEDLEY_CONST CONSTEXPR_14 + long double pow10(const unsigned& n) noexcept { + long double multiplicand = n > 0 ? 
10 : 0.1, + ret = 1; + + for (unsigned i = 0; i < n; i++) { + ret *= multiplicand; + } + + return ret; + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + /** Private site-indexed array mapping byte sizes to an integer size enum */ + constexpr DataType int_type_arr[8] = { + DataType::CSV_INT8, // 1 + DataType::CSV_INT16, // 2 + DataType::UNKNOWN, + DataType::CSV_INT32, // 4 + DataType::UNKNOWN, + DataType::UNKNOWN, + DataType::UNKNOWN, + DataType::CSV_INT64 // 8 + }; + + template + inline DataType type_num() { + static_assert(std::is_integral::value, "T should be an integral type."); + static_assert(sizeof(T) <= 8, "Byte size must be no greater than 8."); + return int_type_arr[sizeof(T) - 1]; + } + + template<> inline DataType type_num() { return DataType::CSV_DOUBLE; } + template<> inline DataType type_num() { return DataType::CSV_DOUBLE; } + template<> inline DataType type_num() { return DataType::CSV_DOUBLE; } + template<> inline DataType type_num() { return DataType::CSV_NULL; } + template<> inline DataType type_num() { return DataType::CSV_STRING; } + + CONSTEXPR_14 DataType data_type(csv::string_view in, long double* const out = nullptr, + const char decimalsymbol = '.'); +#endif + + /** Given a byte size, return the largest number than can be stored in + * an integer of that size + * + * Note: Provides a platform-agnostic way of mapping names like "long int" to + * byte sizes + */ + template + CONSTEXPR_14 long double get_int_max() { + static_assert(Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8, + "Bytes must be a power of 2 below 8."); + + IF_CONSTEXPR (sizeof(signed char) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR (sizeof(short) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR (sizeof(int) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR (sizeof(long int) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR (sizeof(long 
long int) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + HEDLEY_UNREACHABLE(); + } + + /** Given a byte size, return the largest number than can be stored in + * an unsigned integer of that size + */ + template + CONSTEXPR_14 long double get_uint_max() { + static_assert(Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8, + "Bytes must be a power of 2 below 8."); + + IF_CONSTEXPR(sizeof(unsigned char) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR(sizeof(unsigned short) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR(sizeof(unsigned int) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR(sizeof(unsigned long int) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + IF_CONSTEXPR(sizeof(unsigned long long int) == Bytes) { + return (long double)std::numeric_limits::max(); + } + + HEDLEY_UNREACHABLE(); + } + + /** Largest number that can be stored in a 8-bit integer */ + CONSTEXPR_VALUE_14 long double CSV_INT8_MAX = get_int_max<1>(); + + /** Largest number that can be stored in a 16-bit integer */ + CONSTEXPR_VALUE_14 long double CSV_INT16_MAX = get_int_max<2>(); + + /** Largest number that can be stored in a 32-bit integer */ + CONSTEXPR_VALUE_14 long double CSV_INT32_MAX = get_int_max<4>(); + + /** Largest number that can be stored in a 64-bit integer */ + CONSTEXPR_VALUE_14 long double CSV_INT64_MAX = get_int_max<8>(); + + /** Largest number that can be stored in a 8-bit ungisned integer */ + CONSTEXPR_VALUE_14 long double CSV_UINT8_MAX = get_uint_max<1>(); + + /** Largest number that can be stored in a 16-bit unsigned integer */ + CONSTEXPR_VALUE_14 long double CSV_UINT16_MAX = get_uint_max<2>(); + + /** Largest number that can be stored in a 32-bit unsigned integer */ + CONSTEXPR_VALUE_14 long double CSV_UINT32_MAX = get_uint_max<4>(); + + /** Largest number that can be stored in a 64-bit unsigned integer */ + 
CONSTEXPR_VALUE_14 long double CSV_UINT64_MAX = get_uint_max<8>(); + + /** Given a pointer to the start of what is start of + * the exponential part of a number written (possibly) in scientific notation + * parse the exponent + */ + HEDLEY_PRIVATE CONSTEXPR_14 + DataType _process_potential_exponential( + csv::string_view exponential_part, + const long double& coeff, + long double * const out) { + long double exponent = 0; + auto result = data_type(exponential_part, &exponent); + + // Exponents in scientific notation should not be decimal numbers + if (result >= DataType::CSV_INT8 && result < DataType::CSV_DOUBLE) { + if (out) *out = coeff * pow10(exponent); + return DataType::CSV_DOUBLE; + } + + return DataType::CSV_STRING; + } + + /** Given the absolute value of an integer, determine what numeric type + * it fits in + */ + HEDLEY_PRIVATE HEDLEY_PURE CONSTEXPR_14 + DataType _determine_integral_type(const long double& number) noexcept { + // We can assume number is always non-negative + assert(number >= 0); + + if (number <= internals::CSV_INT8_MAX) + return DataType::CSV_INT8; + else if (number <= internals::CSV_INT16_MAX) + return DataType::CSV_INT16; + else if (number <= internals::CSV_INT32_MAX) + return DataType::CSV_INT32; + else if (number <= internals::CSV_INT64_MAX) + return DataType::CSV_INT64; + else // Conversion to long long will cause an overflow + return DataType::CSV_BIGINT; + } + + /** Distinguishes numeric from other text values. Used by various + * type casting functions, like csv_parser::CSVReader::read_row() + * + * #### Rules + * - Leading and trailing whitespace ("padding") ignored + * - A string of just whitespace is NULL + * + * @param[in] in String value to be examined + * @param[out] out Pointer to long double where results of numeric parsing + * get stored + * @param[in] decimalSymbol the character separating integral and decimal part, + * defaults to '.' 
if omitted + */ + CONSTEXPR_14 + DataType data_type(csv::string_view in, long double* const out, const char decimalSymbol) { + // Empty string --> NULL + if (in.size() == 0) + return DataType::CSV_NULL; + + bool ws_allowed = true, + dot_allowed = true, + digit_allowed = true, + is_negative = false, + has_digit = false, + prob_float = false; + + unsigned places_after_decimal = 0; + long double integral_part = 0, + decimal_part = 0; + + for (size_t i = 0, ilen = in.size(); i < ilen; i++) { + const char& current = in[i]; + + switch (current) { + case ' ': + if (!ws_allowed) { + if (isdigit(in[i - 1])) { + digit_allowed = false; + ws_allowed = true; + } + else { + // Ex: '510 123 4567' + return DataType::CSV_STRING; + } + } + break; + case '+': + if (!ws_allowed) { + return DataType::CSV_STRING; + } + + break; + case '-': + if (!ws_allowed) { + // Ex: '510-123-4567' + return DataType::CSV_STRING; + } + + is_negative = true; + break; + // case decimalSymbol: not allowed because decimalSymbol is not a literal, + // it is handled in the default block + case 'e': + case 'E': + // Process scientific notation + if (prob_float || (i && i + 1 < ilen && isdigit(in[i - 1]))) { + size_t exponent_start_idx = i + 1; + prob_float = true; + + // Strip out plus sign + if (in[i + 1] == '+') { + exponent_start_idx++; + } + + return _process_potential_exponential( + in.substr(exponent_start_idx), + is_negative ? 
-(integral_part + decimal_part) : integral_part + decimal_part, + out + ); + } + + return DataType::CSV_STRING; + break; + default: + short digit = static_cast(current - '0'); + if (digit >= 0 && digit <= 9) { + // Process digit + has_digit = true; + + if (!digit_allowed) + return DataType::CSV_STRING; + else if (ws_allowed) // Ex: '510 456' + ws_allowed = false; + + // Build current number + if (prob_float) + decimal_part += digit / pow10(++places_after_decimal); + else + integral_part = (integral_part * 10) + digit; + } + // case decimalSymbol: not allowed because decimalSymbol is not a literal. + else if (dot_allowed && current == decimalSymbol) { + dot_allowed = false; + prob_float = true; + } + else { + return DataType::CSV_STRING; + } + } + } + + // No non-numeric/non-whitespace characters found + if (has_digit) { + long double number = integral_part + decimal_part; + if (out) { + *out = is_negative ? -number : number; + } + + return prob_float ? DataType::CSV_DOUBLE : _determine_integral_type(number); + } + + // Just whitespace + return DataType::CSV_NULL; + } + } +} + +namespace csv { + namespace internals { + class IBasicCSVParser; + + static const std::string ERROR_NAN = "Not a number."; + static const std::string ERROR_OVERFLOW = "Overflow error."; + static const std::string ERROR_FLOAT_TO_INT = + "Attempted to convert a floating point value to an integral type."; + static const std::string ERROR_NEG_TO_UNSIGNED = "Negative numbers cannot be converted to unsigned types."; + + std::string json_escape_string(csv::string_view s) noexcept; + + /** A barebones class used for describing CSV fields */ + struct RawCSVField { + RawCSVField() = default; + RawCSVField(size_t _start, size_t _length, bool _double_quote = false) { + start = _start; + length = _length; + has_double_quote = _double_quote; + } + + /** The start of the field, relative to the beginning of the row */ + size_t start; + + /** The length of the row, ignoring quote escape characters */ + size_t 
length; + + /** Whether or not the field contains an escaped quote */ + bool has_double_quote; + }; + + /** A class used for efficiently storing RawCSVField objects and expanding as necessary + * + * @par Implementation + * This data structure stores RawCSVField in continguous blocks. When more capacity + * is needed, a new block is allocated, but previous data stays put. + * + * @par Thread Safety + * This class may be safely read from multiple threads and written to from one, + * as long as the writing thread does not actively touch fields which are being + * read. + */ + class CSVFieldList { + public: + /** Construct a CSVFieldList which allocates blocks of a certain size */ + CSVFieldList(size_t single_buffer_capacity = (size_t)(internals::PAGE_SIZE / sizeof(RawCSVField))) : + _single_buffer_capacity(single_buffer_capacity) { + this->allocate(); + } + + // No copy constructor + CSVFieldList(const CSVFieldList& other) = delete; + + // CSVFieldArrays may be moved + CSVFieldList(CSVFieldList&& other) : + _single_buffer_capacity(other._single_buffer_capacity) { + + for (auto&& buffer : other.buffers) { + this->buffers.emplace_back(std::move(buffer)); + } + + _current_buffer_size = other._current_buffer_size; + _back = other._back; + } + + template + void emplace_back(Args&&... 
args) { + if (this->_current_buffer_size == this->_single_buffer_capacity) { + this->allocate(); + } + + *(_back++) = RawCSVField(std::forward(args)...); + _current_buffer_size++; + } + + size_t size() const noexcept { + return this->_current_buffer_size + ((this->buffers.size() - 1) * this->_single_buffer_capacity); + } + + RawCSVField& operator[](size_t n) const; + + private: + const size_t _single_buffer_capacity; + + /** + * Prefer std::deque over std::vector because it does not + * reallocate upon expansion, allowing pointers to its members + * to remain valid & avoiding potential race conditions when + * CSVFieldList is accesssed simulatenously by a reading thread and + * a writing thread + */ + std::deque> buffers = {}; + + /** Number of items in the current buffer */ + size_t _current_buffer_size = 0; + + /** Pointer to the current empty field */ + RawCSVField* _back = nullptr; + + /** Allocate a new page of memory */ + void allocate(); + }; + + /** A class for storing raw CSV data and associated metadata */ + struct RawCSVData { + std::shared_ptr _data = nullptr; + csv::string_view data = ""; + + internals::CSVFieldList fields; + + std::unordered_set has_double_quotes = {}; + + // TODO: Consider replacing with a more thread-safe structure + std::unordered_map double_quote_fields = {}; + + internals::ColNamesPtr col_names = nullptr; + internals::ParseFlagMap parse_flags; + internals::WhitespaceMap ws_flags; + }; + + using RawCSVDataPtr = std::shared_ptr; + } + + /** + * @class CSVField + * @brief Data type representing individual CSV values. + * CSVFields can be obtained by using CSVRow::operator[] + */ + class CSVField { + public: + /** Constructs a CSVField from a string_view */ + constexpr explicit CSVField(csv::string_view _sv) noexcept : sv(_sv) { }; + + operator std::string() const { + return std::string(" ") + std::string(this->sv); + } + + /** Returns the value casted to the requested type, performing type checking before. 
+ * + * \par Valid options for T + * - std::string or csv::string_view + * - signed integral types (signed char, short, int, long int, long long int) + * - floating point types (float, double, long double) + * - unsigned integers are not supported at this time, but may be in a later release + * + * \par Invalid conversions + * - Converting non-numeric values to any numeric type + * - Converting floating point values to integers + * - Converting a large integer to a smaller type that will not hold it + * + * @note This method is capable of parsing scientific E-notation. + * See [this page](md_docs_source_scientific_notation.html) + * for more details. + * + * @throws std::runtime_error Thrown if an invalid conversion is performed. + * + * @warning Currently, conversions to floating point types are not + * checked for loss of precision + * + * @warning Any string_views returned are only guaranteed to be valid + * if the parent CSVRow is still alive. If you are concerned + * about object lifetimes, then grab a std::string or a + * numeric value. 
+ * + */ + template T get() { + IF_CONSTEXPR(std::is_arithmetic::value) { + // Note: this->type() also converts the CSV value to float + if (this->type() <= DataType::CSV_STRING) { + throw std::runtime_error(internals::ERROR_NAN); + } + } + + IF_CONSTEXPR(std::is_integral::value) { + // Note: this->is_float() also converts the CSV value to float + if (this->is_float()) { + throw std::runtime_error(internals::ERROR_FLOAT_TO_INT); + } + + IF_CONSTEXPR(std::is_unsigned::value) { + if (this->value < 0) { + throw std::runtime_error(internals::ERROR_NEG_TO_UNSIGNED); + } + } + } + + // Allow fallthrough from previous if branch + IF_CONSTEXPR(!std::is_floating_point::value) { + IF_CONSTEXPR(std::is_unsigned::value) { + // Quick hack to perform correct unsigned integer boundary checks + if (this->value > internals::get_uint_max()) { + throw std::runtime_error(internals::ERROR_OVERFLOW); + } + } + else if (internals::type_num() < this->_type) { + throw std::runtime_error(internals::ERROR_OVERFLOW); + } + } + + return static_cast(this->value); + } + + /** Parse a hexadecimal value, returning false if the value is not hex. */ + bool try_parse_hex(int& parsedValue); + + /** Attempts to parse a decimal (or integer) value using the given symbol, + * returning `true` if the value is numeric. + * + * @note This method also updates this field's type + * + */ + bool try_parse_decimal(long double& dVal, const char decimalSymbol = '.'); + + /** Compares the contents of this field to a numeric value. If this + * field does not contain a numeric value, then all comparisons return + * false. + * + * @note Floating point values are considered equal if they are within + * `0.000001` of each other. + * + * @warning Multiple numeric comparisons involving the same field can + * be done more efficiently by calling the CSVField::get<>() method. 
+ * + * @sa csv::CSVField::operator==(const char * other) + * @sa csv::CSVField::operator==(csv::string_view other) + */ + template + CONSTEXPR_14 bool operator==(T other) const noexcept + { + static_assert(std::is_arithmetic::value, + "T should be a numeric value."); + + if (this->_type != DataType::UNKNOWN) { + if (this->_type == DataType::CSV_STRING) { + return false; + } + + return internals::is_equal(value, static_cast(other), 0.000001L); + } + + long double out = 0; + if (internals::data_type(this->sv, &out) == DataType::CSV_STRING) { + return false; + } + + return internals::is_equal(out, static_cast(other), 0.000001L); + } + + /** Return a string view over the field's contents */ + CONSTEXPR csv::string_view get_sv() const noexcept { return this->sv; } + + /** Returns true if field is an empty string or string of whitespace characters */ + CONSTEXPR_14 bool is_null() noexcept { return type() == DataType::CSV_NULL; } + + /** Returns true if field is a non-numeric, non-empty string */ + CONSTEXPR_14 bool is_str() noexcept { return type() == DataType::CSV_STRING; } + + /** Returns true if field is an integer or float */ + CONSTEXPR_14 bool is_num() noexcept { return type() >= DataType::CSV_INT8; } + + /** Returns true if field is an integer */ + CONSTEXPR_14 bool is_int() noexcept { + return (type() >= DataType::CSV_INT8) && (type() <= DataType::CSV_INT64); + } + + /** Returns true if field is a floating point value */ + CONSTEXPR_14 bool is_float() noexcept { return type() == DataType::CSV_DOUBLE; }; + + /** Return the type of the underlying CSV data */ + CONSTEXPR_14 DataType type() noexcept { + this->get_value(); + return _type; + } + + private: + long double value = 0; /**< Cached numeric value */ + csv::string_view sv = ""; /**< A pointer to this field's text */ + DataType _type = DataType::UNKNOWN; /**< Cached data type value */ + CONSTEXPR_14 void get_value() noexcept { + /* Check to see if value has been cached previously, if not + * evaluate it */ - 
struct ColNames { - public: - ColNames() = default; - ColNames(const std::vector& names) { - set_col_names(names); + if ((int)_type < 0) { + this->_type = internals::data_type(this->sv, &this->value); } + } + }; + + /** Data structure for representing CSV rows */ + class CSVRow { + public: + friend internals::IBasicCSVParser; + + CSVRow() = default; + + /** Construct a CSVRow from a RawCSVDataPtr */ + CSVRow(internals::RawCSVDataPtr _data) : data(_data) {} + CSVRow(internals::RawCSVDataPtr _data, size_t _data_start, size_t _field_bounds) + : data(_data), data_start(_data_start), fields_start(_field_bounds) {} + + /** Indicates whether row is empty or not */ + CONSTEXPR bool empty() const noexcept { return this->size() == 0; } + + /** Return the number of fields in this row */ + CONSTEXPR size_t size() const noexcept { return row_length; } + + /** @name Value Retrieval */ + ///@{ + CSVField operator[](size_t n) const; + CSVField operator[](const std::string&) const; + std::string to_json(const std::vector& subset = {}) const; + std::string to_json_array(const std::vector& subset = {}) const; + + /** Retrieve this row's associated column names */ + std::vector get_col_names() const { + return this->data->col_names->get_col_names(); + } + + /** Convert this CSVRow into a vector of strings. + * **Note**: This is a less efficient method of + * accessing data than using the [] operator. + */ + operator std::vector() const; + ///@} + + /** A random access iterator over the contents of a CSV row. + * Each iterator points to a CSVField. 
+ */ + class iterator { + public: +#ifndef DOXYGEN_SHOULD_SKIP_THIS + using value_type = CSVField; + using difference_type = int; + using pointer = std::shared_ptr; + using reference = CSVField & ; + using iterator_category = std::random_access_iterator_tag; +#endif + iterator(const CSVRow*, int i); + + reference operator*() const; + pointer operator->() const; + + iterator operator++(int); + iterator& operator++(); + iterator operator--(int); + iterator& operator--(); + iterator operator+(difference_type n) const; + iterator operator-(difference_type n) const; + + /** Two iterators are equal if they point to the same field */ + CONSTEXPR bool operator==(const iterator& other) const noexcept { + return this->i == other.i; + }; + + CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); } + +#ifndef NDEBUG + friend CSVRow; +#endif + + private: + const CSVRow * daddy = nullptr; // Pointer to parent + std::shared_ptr field = nullptr; // Current field pointed at + int i = 0; // Index of current field + }; + + /** A reverse iterator over the contents of a CSVRow. */ + using reverse_iterator = std::reverse_iterator; + + /** @name Iterators + * @brief Each iterator points to a CSVField object. 
+ */ + ///@{ + iterator begin() const; + iterator end() const noexcept; + reverse_iterator rbegin() const noexcept; + reverse_iterator rend() const; + ///@} + + private: + /** Retrieve a string view corresponding to the specified index */ + csv::string_view get_field(size_t index) const; + + internals::RawCSVDataPtr data; + + /** Where in RawCSVData.data we start */ + size_t data_start = 0; + + /** Where in the RawCSVDataPtr.fields array we start */ + size_t fields_start = 0; + + /** How many columns this row spans */ + size_t row_length = 0; + }; + +#ifdef _MSC_VER +#pragma region CSVField::get Specializations +#endif + /** Retrieve this field's original string */ + template<> + inline std::string CSVField::get() { + return std::string(this->sv); + } + + /** Retrieve a view over this field's string + * + * @warning This string_view is only guaranteed to be valid as long as this + * CSVRow is still alive. + */ + template<> + CONSTEXPR_14 csv::string_view CSVField::get() { + return this->sv; + } - std::vector get_col_names() const; - void set_col_names(const std::vector&); - int index_of(csv::string_view) const; + /** Retrieve this field's value as a long double */ + template<> + CONSTEXPR_14 long double CSVField::get() { + if (!is_num()) + throw std::runtime_error(internals::ERROR_NAN); - bool empty() const { return this->col_names.empty(); } - size_t size() const; + return this->value; + } +#ifdef _MSC_VER +#pragma endregion CSVField::get Specializations +#endif - private: - std::vector col_names; - std::unordered_map col_pos; - }; + /** Compares the contents of this field to a string */ + template<> + CONSTEXPR bool CSVField::operator==(const char * other) const noexcept + { + return this->sv == other; + } + + /** Compares the contents of this field to a string */ + template<> + CONSTEXPR bool CSVField::operator==(csv::string_view other) const noexcept + { + return this->sv == other; } } -/** @file - * Defines the data type used for storing information about a 
CSV row - */ -#include -#include -#include -#include -#include // For ColNames -#include -#include // For CSVField -#include // For CSVField -#include +inline std::ostream& operator << (std::ostream& os, csv::CSVField const& value) { + os << std::string(value); + return os; +} namespace csv { - class BasicCSVParser; + namespace internals { + /** Create a vector v where each index i corresponds to the + * ASCII number for a character and, v[i + 128] labels it according to + * the CSVReader::ParseFlags enum + */ + HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter) { + std::array ret = {}; + for (int i = -128; i < 128; i++) { + const int arr_idx = i + 128; + char ch = char(i); - struct RawCSVField { - size_t start; - size_t length; - }; + if (ch == delimiter) + ret[arr_idx] = ParseFlags::DELIMITER; + else if (ch == '\r' || ch == '\n') + ret[arr_idx] = ParseFlags::NEWLINE; + else + ret[arr_idx] = ParseFlags::NOT_SPECIAL; + } - namespace internals { - static const std::string ERROR_NAN = "Not a number."; - static const std::string ERROR_OVERFLOW = "Overflow error."; - static const std::string ERROR_FLOAT_TO_INT = - "Attempted to convert a floating point value to an integral type."; - static const std::string ERROR_NEG_TO_UNSIGNED = "Negative numbers cannot be converted to unsigned types."; + return ret; + } - std::string json_escape_string(csv::string_view s) noexcept; + /** Create a vector v where each index i corresponds to the + * ASCII number for a character and, v[i + 128] labels it according to + * the CSVReader::ParseFlags enum + */ + HEDLEY_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter, char quote_char) { + std::array ret = make_parse_flags(delimiter); + ret[(size_t)quote_char + 128] = ParseFlags::QUOTE; + return ret; + } + + /** Create a vector v where each index i corresponds to the + * ASCII number for a character c and, v[i + 128] is true if + * c is a whitespace character + */ + HEDLEY_CONST CONSTEXPR_17 WhitespaceMap 
make_ws_flags(const char* ws_chars, size_t n_chars) { + std::array ret = {}; + for (int i = -128; i < 128; i++) { + const int arr_idx = i + 128; + char ch = char(i); + ret[arr_idx] = false; + + for (size_t j = 0; j < n_chars; j++) { + if (ws_chars[j] == ch) { + ret[arr_idx] = true; + } + } + } + + return ret; + } - /** A class used for efficiently storing RawCSVField objects and expanding as necessary */ - class CSVFieldArray { + inline WhitespaceMap make_ws_flags(const std::vector& flags) { + return make_ws_flags(flags.data(), flags.size()); + } + + CSV_INLINE size_t get_file_size(csv::string_view filename); + + CSV_INLINE std::string get_csv_head(csv::string_view filename); + + /** Read the first 500KB of a CSV file */ + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size); + + /** A std::deque wrapper which allows multiple read and write threads to concurrently + * access it along with providing read threads the ability to wait for the deque + * to become populated + */ + template + class ThreadSafeDeque { public: - CSVFieldArray() { - this->allocate(); + ThreadSafeDeque(size_t notify_size = 100) : _notify_size(notify_size) {}; + ThreadSafeDeque(const ThreadSafeDeque& other) { + this->data = other.data; + this->_notify_size = other._notify_size; } - RawCSVField& operator[](size_t n) { - if (n > this->size()) { - throw std::runtime_error("Index out of bounds."); - } + ThreadSafeDeque(const std::deque& source) : ThreadSafeDeque() { + this->data = source; + } - size_t page_no = (size_t)std::floor((double)(n / this->single_buffer_capacity)); - size_t buffer_idx = (page_no < 1) ? 
n : n % this->single_buffer_capacity; - return this->buffers[page_no][buffer_idx]; + void clear() noexcept { this->data.clear(); } + + bool empty() const noexcept { + return this->data.empty(); } - void push_back(RawCSVField&& field) { - if (this->_current_buffer_size == this->single_buffer_capacity) { - this->allocate(); + T& front() noexcept { + return this->data.front(); + } + + T& operator[](size_t n) { + return this->data[n]; + } + + void push_back(T&& item) { + std::lock_guard lock{ this->_lock }; + this->data.push_back(std::move(item)); + + if (this->size() >= _notify_size) { + this->_cond.notify_all(); } + } - this->buffers.back()[this->_current_buffer_size] = std::move(field); - this->_current_buffer_size++; - this->_size++; + T pop_front() noexcept { + std::lock_guard lock{ this->_lock }; + T item = std::move(data.front()); + data.pop_front(); + return item; } - ~CSVFieldArray() { - for (auto& buffer : buffers) { - delete[] buffer; + size_t size() const noexcept { return this->data.size(); } + + /** Returns true if a thread is actively pushing items to this deque */ + constexpr bool is_waitable() const noexcept { return this->_is_waitable; } + + /** Wait for an item to become available */ + void wait() { + if (!is_waitable()) { + return; } + + std::unique_lock lock{ this->_lock }; + this->_cond.wait(lock, [this] { return this->size() >= _notify_size || !this->is_waitable(); }); + lock.unlock(); } - CONSTEXPR size_t size() const noexcept { - return this->_size; + typename std::deque::iterator begin() noexcept { + return this->data.begin(); } - private: - const size_t single_buffer_capacity = (size_t)(internals::PAGE_SIZE / alignof(RawCSVField)); + typename std::deque::iterator end() noexcept { + return this->data.end(); + } - std::vector buffers = {}; - size_t _current_buffer_size = 0; - size_t _size = 0; + /** Tell listeners that this deque is actively being pushed to */ + void notify_all() { + std::unique_lock lock{ this->_lock }; + this->_is_waitable = 
true; + this->_cond.notify_all(); + } - /** Allocate a new page of memory */ - void allocate(); + /** Tell all listeners to stop */ + void kill_all() { + std::unique_lock lock{ this->_lock }; + this->_is_waitable = false; + this->_cond.notify_all(); + } + + private: + bool _is_waitable = false; + size_t _notify_size; + std::mutex _lock; + std::condition_variable _cond; + std::deque data; }; + + constexpr const int UNINITIALIZED_FIELD = -1; } - /** A class for storing raw CSV data and associated metadata */ - struct RawCSVData { - std::string data = ""; - internals::CSVFieldArray fields; + /** Standard type for storing collection of rows */ + using RowCollection = internals::ThreadSafeDeque; - std::unordered_set has_double_quotes = {}; - std::unordered_map double_quote_fields = {}; - internals::ColNamesPtr col_names = nullptr; - }; + namespace internals { + /** Abstract base class which provides CSV parsing logic. + * + * Concrete implementations may customize this logic across + * different input sources, such as memory mapped files, stringstreams, + * etc... + */ + class IBasicCSVParser { + public: + IBasicCSVParser() = default; + IBasicCSVParser(const CSVFormat&, const ColNamesPtr&); + IBasicCSVParser(const ParseFlagMap& parse_flags, const WhitespaceMap& ws_flags + ) : _parse_flags(parse_flags), _ws_flags(ws_flags) {}; - using RawCSVDataPtr = std::shared_ptr; + virtual ~IBasicCSVParser() {} - /** - * @class CSVField - * @brief Data type representing individual CSV values. 
- * CSVFields can be obtained by using CSVRow::operator[] - */ - class CSVField { - public: - /** Constructs a CSVField from a string_view */ - constexpr explicit CSVField(csv::string_view _sv) : sv(_sv) { }; + /** Whether or not we have reached the end of source */ + bool eof() { return this->_eof; } - operator std::string() const { - return std::string(" ") + std::string(this->sv); - } + /** Parse the next block of data */ + virtual void next(size_t bytes) = 0; - /** Returns the value casted to the requested type, performing type checking before. - * - * \par Valid options for T - * - std::string or csv::string_view - * - signed integral types (signed char, short, int, long int, long long int) - * - floating point types (float, double, long double) - * - unsigned integers are not supported at this time, but may be in a later release - * - * \par Invalid conversions - * - Converting non-numeric values to any numeric type - * - Converting floating point values to integers - * - Converting a large integer to a smaller type that will not hold it - * - * @note This method is capable of parsing scientific E-notation. - * See [this page](md_docs_source_scientific_notation.html) - * for more details. - * - * @throws std::runtime_error Thrown if an invalid conversion is performed. - * - * @warning Currently, conversions to floating point types are not - * checked for loss of precision - * - * @warning Any string_views returned are only guaranteed to be valid - * if the parent CSVRow is still alive. If you are concerned - * about object lifetimes, then grab a std::string or a - * numeric value. 
- * - */ - template T get() { - IF_CONSTEXPR(std::is_arithmetic::value) { - // Note: this->type() also converts the CSV value to float - if (this->type() <= DataType::CSV_STRING) { - throw std::runtime_error(internals::ERROR_NAN); - } + /** Indicate the last block of data has been parsed */ + void end_feed(); + + CONSTEXPR_17 ParseFlags parse_flag(const char ch) const noexcept { + return _parse_flags.data()[ch + 128]; } - IF_CONSTEXPR(std::is_integral::value) { - // Note: this->is_float() also converts the CSV value to float - if (this->is_float()) { - throw std::runtime_error(internals::ERROR_FLOAT_TO_INT); + CONSTEXPR_17 ParseFlags compound_parse_flag(const char ch) const noexcept { + return quote_escape_flag(parse_flag(ch), this->quote_escape); + } + + /** Whether or not this CSV has a UTF-8 byte order mark */ + CONSTEXPR bool utf8_bom() const { return this->_utf8_bom; } + + void set_output(RowCollection& rows) { this->_records = &rows; } + + protected: + /** @name Current Parser State */ + ///@{ + CSVRow current_row; + RawCSVDataPtr data_ptr = nullptr; + ColNamesPtr _col_names = nullptr; + CSVFieldList* fields = nullptr; + int field_start = UNINITIALIZED_FIELD; + size_t field_length = 0; + + /** An array where the (i + 128)th slot gives the ParseFlags for ASCII character i */ + ParseFlagMap _parse_flags; + ///@} + + /** @name Current Stream/File State */ + ///@{ + bool _eof = false; + + /** The size of the incoming CSV */ + size_t source_size = 0; + ///@} + + /** Whether or not source needs to be read in chunks */ + CONSTEXPR bool no_chunk() const { return this->source_size < ITERATION_CHUNK_SIZE; } + + /** Parse the current chunk of data * + * + * @returns How many character were read that are part of complete rows + */ + size_t parse(); + + /** Create a new RawCSVDataPtr for a new chunk of data */ + void reset_data_ptr(); + private: + /** An array where the (i + 128)th slot determines whether ASCII character i should + * be trimmed + */ + WhitespaceMap 
_ws_flags; + bool quote_escape = false; + bool field_has_double_quote = false; + + /** Where we are in the current data block */ + size_t data_pos = 0; + + /** Whether or not an attempt to find Unicode BOM has been made */ + bool unicode_bom_scan = false; + bool _utf8_bom = false; + + /** Where complete rows should be pushed to */ + RowCollection* _records = nullptr; + + CONSTEXPR_17 bool ws_flag(const char ch) const noexcept { + return _ws_flags.data()[ch + 128]; + } + + size_t& current_row_start() { + return this->current_row.data_start; + } + + void parse_field() noexcept; + + /** Finish parsing the current field */ + void push_field(); + + /** Finish parsing the current row */ + void push_row(); + + /** Handle possible Unicode byte order mark */ + void trim_utf8_bom(); + }; + + /** A class for parsing CSV data from a `std::stringstream` + * or an `std::ifstream` + */ + template + class StreamParser: public IBasicCSVParser { + using RowCollection = ThreadSafeDeque; + + public: + StreamParser(TStream& source, + const CSVFormat& format, + const ColNamesPtr& col_names = nullptr + ) : IBasicCSVParser(format, col_names), _source(std::move(source)) {}; + + StreamParser( + TStream& source, + internals::ParseFlagMap parse_flags, + internals::WhitespaceMap ws_flags) : + IBasicCSVParser(parse_flags, ws_flags), + _source(std::move(source)) + {}; + + ~StreamParser() {} + + void next(size_t bytes = ITERATION_CHUNK_SIZE) override { + if (this->eof()) return; + + this->reset_data_ptr(); + this->data_ptr->_data = std::make_shared(); + + if (source_size == 0) { + const auto start = _source.tellg(); + _source.seekg(0, std::ios::end); + const auto end = _source.tellg(); + _source.seekg(0, std::ios::beg); + + source_size = end - start; } - IF_CONSTEXPR(std::is_unsigned::value) { - if (this->value < 0) { - throw std::runtime_error(internals::ERROR_NEG_TO_UNSIGNED); - } + // Read data into buffer + size_t length = std::min(source_size - stream_pos, bytes); + std::unique_ptr buff(new 
char[length]); + _source.seekg(stream_pos, std::ios::beg); + _source.read(buff.get(), length); + stream_pos = _source.tellg(); + ((std::string*)(this->data_ptr->_data.get()))->assign(buff.get(), length); + + // Create string_view + this->data_ptr->data = *((std::string*)this->data_ptr->_data.get()); + + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); + + if (stream_pos == source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } + else { + this->stream_pos -= (length - remainder); } } - // Allow fallthrough from previous if branch - IF_CONSTEXPR(!std::is_floating_point::value) { - IF_CONSTEXPR(std::is_unsigned::value) { - // Quick hack to perform correct unsigned integer boundary checks - if (this->value > internals::get_uint_max()) { - throw std::runtime_error(internals::ERROR_OVERFLOW); - } - } - else if (internals::type_num() < this->_type) { - throw std::runtime_error(internals::ERROR_OVERFLOW); - } - } + private: + TStream _source; + size_t stream_pos = 0; + }; + + /** Parser for memory-mapped files + * + * @par Implementation + * This class constructs moving windows over a file to avoid + * creating massive memory maps which may require more RAM + * than the user has available. It contains logic to automatically + * re-align each memory map to the beginning of a CSV row. 
+ * + */ + class MmapParser : public IBasicCSVParser { + public: + MmapParser(csv::string_view filename, + const CSVFormat& format, + const ColNamesPtr& col_names = nullptr + ) : IBasicCSVParser(format, col_names) { + this->_filename = filename.data(); + this->source_size = get_file_size(filename); + }; + + ~MmapParser() {} + + void next(size_t bytes) override; + + private: + std::string _filename; + size_t mmap_pos = 0; + }; + } +} + + +/** The all encompassing namespace */ +namespace csv { + /** Stuff that is generally not of interest to end-users */ + namespace internals { + std::string format_row(const std::vector& row, csv::string_view delim = ", "); + + std::vector _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv()); + + struct GuessScore { + double score; + size_t header; + }; + + CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format); + + CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); + } + + std::vector get_col_names( + csv::string_view filename, + const CSVFormat format = CSVFormat::guess_csv()); - return static_cast(this->value); - } + /** Guess the delimiter used by a delimiter-separated values file */ + CSVGuessResult guess_format(csv::string_view filename, + const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); - /** Compares the contents of this field to a numeric value. If this - * field does not contain a numeric value, then all comparisons return - * false. - * - * @note Floating point values are considered equal if they are within - * `0.000001` of each other. 
+ /** @class CSVReader + * @brief Main class for parsing CSVs from files and in-memory sources + * + * All rows are compared to the column names for length consistency + * - By default, rows that are too short or too long are dropped + * - Custom behavior can be defined by overriding bad_row_handler in a subclass + */ + class CSVReader { + public: + /** + * An input iterator capable of handling large files. + * @note Created by CSVReader::begin() and CSVReader::end(). * - * @warning Multiple numeric comparisons involving the same field can - * be done more efficiently by calling the CSVField::get<>() method. + * @par Iterating over a file + * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 1 * - * @sa csv::CSVField::operator==(const char * other) - * @sa csv::CSVField::operator==(csv::string_view other) + * @par Using with `` library + * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 2 */ - template - bool operator==(T other) const - { - static_assert(std::is_arithmetic::value, - "T should be a numeric value."); - - if (this->_type != DataType::UNKNOWN) { - if (this->_type == DataType::CSV_STRING) { - return false; - } + class iterator { + public: + #ifndef DOXYGEN_SHOULD_SKIP_THIS + using value_type = CSVRow; + using difference_type = std::ptrdiff_t; + using pointer = CSVRow * ; + using reference = CSVRow & ; + using iterator_category = std::input_iterator_tag; + #endif - return internals::is_equal(value, static_cast(other), 0.000001L); - } + iterator() = default; + iterator(CSVReader* reader) : daddy(reader) {}; + iterator(CSVReader*, CSVRow&&); - long double out = 0; - if (internals::data_type(this->sv, &out) == DataType::CSV_STRING) { - return false; - } + /** Access the CSVRow held by the iterator */ + CONSTEXPR_14 reference operator*() { return this->row; } - return internals::is_equal(out, static_cast(other), 0.000001L); - } + /** Return a pointer to the CSVRow the iterator has stopped at */ + CONSTEXPR_14 pointer operator->() { return 
&(this->row); } - /** Return a string view over the field's contents */ - CONSTEXPR csv::string_view get_sv() const { return this->sv; } + iterator& operator++(); /**< Pre-increment iterator */ + iterator operator++(int); /**< Post-increment iterator */ - /** Returns true if field is an empty string or string of whitespace characters */ - CONSTEXPR bool is_null() { return type() == DataType::CSV_NULL; } + /** Returns true if iterators were constructed from the same CSVReader + * and point to the same row + */ + CONSTEXPR bool operator==(const iterator& other) const noexcept { + return (this->daddy == other.daddy) && (this->i == other.i); + } - /** Returns true if field is a non-numeric, non-empty string */ - CONSTEXPR bool is_str() { return type() == DataType::CSV_STRING; } + CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); } + private: + CSVReader * daddy = nullptr; // Pointer to parent + CSVRow row; // Current row + size_t i = 0; // Index of current row + }; - /** Returns true if field is an integer or float */ - CONSTEXPR bool is_num() { return type() >= DataType::CSV_INT8; } + /** @name Constructors + * Constructors for iterating over large files and parsing in-memory sources. + */ + ///@{ + CSVReader(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv()); - /** Returns true if field is an integer */ - CONSTEXPR bool is_int() { - return (type() >= DataType::CSV_INT8) && (type() <= DataType::CSV_INT64); - } + /** Allows parsing stream sources such as `std::stringstream` or `std::ifstream` + * + * @tparam TStream An input stream deriving from `std::istream` + * @note Currently this constructor requires special CSV dialects to be manually + * specified. 
+ */ + template::value, int> = 0> + CSVReader(TStream& source, CSVFormat format = CSVFormat()) : _format(format) { + using Parser = internals::StreamParser; - /** Returns true if field is a floating point value */ - CONSTEXPR bool is_float() { return type() == DataType::CSV_DOUBLE; }; + if (!format.col_names.empty()) + this->set_col_names(format.col_names); - /** Return the type of the underlying CSV data */ - CONSTEXPR DataType type() { - this->get_value(); - return _type; + this->parser = std::unique_ptr( + new Parser(source, format, col_names)); // For C++11 + this->initial_read(); } + ///@} - private: - long double value = 0; /**< Cached numeric value */ - csv::string_view sv = ""; /**< A pointer to this field's text */ - DataType _type = DataType::UNKNOWN; /**< Cached data type value */ - CONSTEXPR void get_value() { - /* Check to see if value has been cached previously, if not - * evaluate it - */ - if ((int)_type < 0) { - this->_type = internals::data_type(this->sv, &this->value); + CSVReader(const CSVReader&) = delete; // No copy constructor + CSVReader(CSVReader&&) = default; // Move constructor + CSVReader& operator=(const CSVReader&) = delete; // No copy assignment + CSVReader& operator=(CSVReader&& other) = default; + ~CSVReader() { + if (this->read_csv_worker.joinable()) { + this->read_csv_worker.join(); } } - }; - - /** Data structure for representing CSV rows */ - class CSVRow { - public: - friend BasicCSVParser; - - CSVRow() = default; - - /** Construct a CSVRow from a RawCSVDataPtr */ - CSVRow(RawCSVDataPtr _data) : data(_data) {} - /** Indicates whether row is empty or not */ - CONSTEXPR bool empty() const { return this->size() == 0; } + /** @name Retrieving CSV Rows */ + ///@{ + bool read_row(CSVRow &row); + iterator begin(); + HEDLEY_CONST iterator end() const noexcept; - /** Return the number of fields in this row */ - CONSTEXPR size_t size() const { return row_length; } + /** Returns true if we have reached end of file */ + bool eof() const 
noexcept { return this->parser->eof(); }; + ///@} - /** @name Value Retrieval */ + /** @name CSV Metadata */ ///@{ - CSVField operator[](size_t n) const; - CSVField operator[](const std::string&) const; - csv::string_view get_field(size_t index) const; - std::string to_json(const std::vector& subset = {}) const; - std::string to_json_array(const std::vector& subset = {}) const; - std::vector get_col_names() const { - return this->data->col_names->get_col_names(); - } - - /** Convert this CSVRow into a vector of strings. - * **Note**: This is a less efficient method of - * accessing data than using the [] operator. - */ - operator std::vector() const; + CSVFormat get_format() const; + std::vector get_col_names() const; + int index_of(csv::string_view col_name) const; ///@} - /** A random access iterator over the contents of a CSV row. - * Each iterator points to a CSVField. + /** @name CSV Metadata: Attributes */ + ///@{ + /** Whether or not the file or stream contains valid CSV rows, + * not including the header. + * + * @note Gives an accurate answer regardless of when it is called. 
+ * */ - class iterator { - public: -#ifndef DOXYGEN_SHOULD_SKIP_THIS - using value_type = CSVField; - using difference_type = int; + CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; } - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - // but using shared_ptr as pointer type won't compile in g++ -#ifdef _MSC_BUILD - using pointer = std::shared_ptr; -#else - using pointer = CSVField * ; -#endif + /** Retrieves the number of rows that have been read so far */ + CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; } - using reference = CSVField & ; - using iterator_category = std::random_access_iterator_tag; -#endif - iterator(const CSVRow*, int i); + /** Whether or not CSV was prefixed with a UTF-8 bom */ + bool utf8_bom() const noexcept { return this->parser->utf8_bom(); } + ///@} - reference operator*() const; - pointer operator->() const; + protected: + /** + * \defgroup csv_internal CSV Parser Internals + * @brief Internals of CSVReader. Only maintainers and those looking to + * extend the parser should read this. 
+ * @{ + */ - iterator operator++(int); - iterator& operator++(); - iterator operator--(int); - iterator& operator--(); - iterator operator+(difference_type n) const; - iterator operator-(difference_type n) const; + /** Sets this reader's column names and associated data */ + void set_col_names(const std::vector&); - /** Two iterators are equal if they point to the same field */ - constexpr bool operator==(const iterator& other) const { - return this->i == other.i; - }; + /** @name CSV Settings **/ + ///@{ + CSVFormat _format; + ///@} - constexpr bool operator!=(const iterator& other) const { return !operator==(other); } + /** @name Parser State */ + ///@{ + /** Pointer to a object containing column information */ + internals::ColNamesPtr col_names = std::make_shared(); -#ifndef NDEBUG - friend CSVRow; -#endif + /** Helper class which actually does the parsing */ + std::unique_ptr parser = nullptr; - private: - const CSVRow * daddy = nullptr; // Pointer to parent - std::shared_ptr field = nullptr; // Current field pointed at - int i = 0; // Index of current field - }; + /** Queue of parsed CSV rows */ + std::unique_ptr records{new RowCollection(100)}; - /** A reverse iterator over the contents of a CSVRow. */ - using reverse_iterator = std::reverse_iterator; + size_t n_cols = 0; /**< The number of columns in this CSV */ + size_t _n_rows = 0; /**< How many rows (minus header) have been read so far */ - /** @name Iterators - * @brief Each iterator points to a CSVField object. 
- */ - ///@{ - iterator begin() const; - iterator end() const; - reverse_iterator rbegin() const; - reverse_iterator rend() const; + /** @name Multi-Threaded File Reading Functions */ + ///@{ + bool read_csv(size_t bytes = internals::ITERATION_CHUNK_SIZE); ///@} + /**@}*/ + private: - RawCSVDataPtr data; + /** Whether or not rows before header were trimmed */ + bool header_trimmed = false; - /** Where in RawCSVData.data we start */ - size_t data_start = 0; + /** @name Multi-Threaded File Reading: Flags and State */ + ///@{ + std::thread read_csv_worker; /**< Worker thread for read_csv() */ + ///@} - /** Where in the RawCSVDataPtr.fields array we start */ - size_t field_bounds_index = 0; + /** Read initial chunk to get metadata */ + void initial_read() { + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv_worker.join(); + } - /** How many columns this row spans */ - size_t row_length = 0; + void trim_header(); }; +} -#ifdef _MSC_VER -#pragma region CSVField::get Specializations -#endif - /** Retrieve this field's original string */ - template<> - inline std::string CSVField::get() { - return std::string(this->sv); - } +/** @file + * Calculates statistics from CSV files + */ - /** Retrieve a view over this field's string +#include +#include +#include + +namespace csv { + /** Class for calculating statistics from CSV files and in-memory sources + * + * **Example** + * \include programs/csv_stats.cpp * - * @warning This string_view is only guaranteed to be valid as long as this - * CSVRow is still alive. 
*/ - template<> - CONSTEXPR csv::string_view CSVField::get() { - return this->sv; - } + class CSVStat { + public: + using FreqCount = std::unordered_map; + using TypeCount = std::unordered_map; - /** Retrieve this field's value as a long double */ - template<> - CONSTEXPR long double CSVField::get() { - if (!is_num()) - throw std::runtime_error(internals::ERROR_NAN); + std::vector get_mean() const; + std::vector get_variance() const; + std::vector get_mins() const; + std::vector get_maxes() const; + std::vector get_counts() const; + std::vector get_dtypes() const; - return this->value; - } -#ifdef _MSC_VER -#pragma endregion CSVField::get Specializations -#endif + std::vector get_col_names() const { + return this->reader.get_col_names(); + } - /** Compares the contents of this field to a string */ - template<> - inline bool CSVField::operator==(const char * other) const - { - return this->sv == other; - } + CSVStat(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv()); + CSVStat(std::stringstream& source, CSVFormat format = CSVFormat()); + private: + // An array of rolling averages + // Each index corresponds to the rolling mean for the column at said index + std::vector rolling_means; + std::vector rolling_vars; + std::vector mins; + std::vector maxes; + std::vector counts; + std::vector dtypes; + std::vector n; - /** Compares the contents of this field to a string */ - template<> - inline bool CSVField::operator==(csv::string_view other) const - { - return this->sv == other; - } -} + // Statistic calculators + void variance(const long double&, const size_t&); + void count(CSVField&, const size_t&); + void min_max(const long double&, const size_t&); + void dtype(CSVField&, const size_t&); -inline std::ostream& operator << (std::ostream& os, csv::CSVField const& value) { - os << std::string(value); - return os; + void calc(); + void calc_chunk(); + void calc_worker(const size_t&); + + CSVReader reader; + std::deque records = {}; + }; } +#include 
+#include +#include namespace csv { - namespace internals { - /** @typedef ParseFlags - * An enum used for describing the significance of each character - * with respect to CSV parsing - */ - enum class ParseFlags { - NOT_SPECIAL, /**< Characters with no special meaning */ - QUOTE, /**< Characters which may signify a quote escape */ - DELIMITER, /**< Characters which may signify a new field */ - NEWLINE /**< Characters which may signify a new row */ - }; + /** Returned by get_file_info() */ + struct CSVFileInfo { + std::string filename; /**< Filename */ + std::vector col_names; /**< CSV column names */ + char delim; /**< Delimiting character */ + size_t n_rows; /**< Number of rows in a file */ + size_t n_cols; /**< Number of columns in a CSV */ + }; - using ParseFlagMap = std::array; - using WhitespaceMap = std::array; - } + /** @name Shorthand Parsing Functions + * @brief Convienience functions for parsing small strings + */ + ///@{ + CSVReader operator ""_csv(const char*, size_t); + CSVReader operator ""_csv_no_header(const char*, size_t); + CSVReader parse(csv::string_view in, CSVFormat format = CSVFormat()); + CSVReader parse_no_header(csv::string_view in); + ///@} - /** A class for parsing raw CSV data */ - class BasicCSVParser { - public: - BasicCSVParser() = default; - BasicCSVParser(internals::ColNamesPtr _col_names) : col_names(_col_names) {}; - BasicCSVParser(internals::ParseFlagMap parse_flags, internals::WhitespaceMap ws_flags) : - _parse_flags(parse_flags), _ws_flags(ws_flags) {}; + /** @name Utility Functions */ + ///@{ + std::unordered_map csv_data_types(const std::string&); + CSVFileInfo get_file_info(const std::string& filename); + int get_col_pos(csv::string_view filename, csv::string_view col_name, + const CSVFormat& format = CSVFormat::guess_csv()); + ///@} +} +/** @file + * A standalone header file for writing delimiter-separated files + */ - void parse(csv::string_view in, std::deque& records); - void end_feed(std::deque& records) { - using 
internals::ParseFlags; +#include +#include +#include +#include +#include +#include - bool empty_last_field = this->current_row.data - && !this->current_row.data->data.empty() - && parse_flag(this->current_row.data->data.back()) == ParseFlags::DELIMITER; - if (this->field_length > 0 || empty_last_field) { - this->push_field(); - } +namespace csv { + namespace internals { + static int DECIMAL_PLACES = 5; - if (this->current_row.size() > 0) { - this->push_row(records); - } + /** + * Calculate the absolute value of a number + */ + template + inline T csv_abs(T x) { + return abs(x); } - void set_parse_flags(internals::ParseFlagMap parse_flags) { - _parse_flags = parse_flags; + template<> + inline int csv_abs(int x) { + return abs(x); } - void set_ws_flags(internals::WhitespaceMap ws_flags) { - _ws_flags = ws_flags; + template<> + inline long int csv_abs(long int x) { + return labs(x); } - private: - CONSTEXPR internals::ParseFlags parse_flag(const char ch) const { - return _parse_flags.data()[ch + 128]; + template<> + inline long long int csv_abs(long long int x) { + return llabs(x); } - CONSTEXPR bool ws_flag(const char ch) const { - return _ws_flags.data()[ch + 128]; + template<> + inline float csv_abs(float x) { + return fabsf(x); } - void push_field(); - CONSTEXPR void parse_field(csv::string_view in, size_t& i, const size_t& current_row_start, bool quote_escape = false); - - void parse_loop(csv::string_view in); - - void push_row(std::deque& records) { - current_row.row_length = current_row.data->fields.size() - current_row.field_bounds_index; - records.push_back(std::move(current_row)); - }; - - void set_data_ptr(RawCSVDataPtr ptr) { - this->data_ptr = ptr; - this->fields = &(ptr->fields); + template<> + inline double csv_abs(double x) { + return fabs(x); } - /** An array where the (i + 128)th slot gives the ParseFlags for ASCII character i */ - internals::ParseFlagMap _parse_flags; + template<> + inline long double csv_abs(long double x) { + return fabsl(x); + } 
- /** An array where the (i + 128)th slot determines whether ASCII character i should - * be trimmed + /** + * Calculate the number of digits in a number */ - internals::WhitespaceMap _ws_flags; - - internals::ColNamesPtr col_names = nullptr; + template< + typename T, + csv::enable_if_t::value, int> = 0 + > + int num_digits(T x) + { + x = csv_abs(x); - CSVRow current_row; - int field_start = -1; - size_t field_length = 0; - bool field_has_double_quote = false; + int digits = 0; - RawCSVDataPtr data_ptr = nullptr; - internals::CSVFieldArray* fields = nullptr; + while (x >= 1) { + x /= 10; + digits++; + } - std::deque* _records = nullptr; - }; -} + return digits; + } -namespace csv { - namespace internals { - /** A string buffer and its size. Consumed by read_csv_worker(). */ - using WorkItem = std::pair, size_t>; + /** to_string() for unsigned integers */ + template::value, int> = 0> + inline std::string to_string(T value) { + std::string digits_reverse = ""; - /** Create a vector v where each index i corresponds to the - * ASCII number for a character and, v[i + 128] labels it according to - * the CSVReader::ParseFlags enum - */ - HEDLEY_CONST CONSTEXPR ParseFlagMap make_parse_flags(char delimiter) { - std::array ret = {}; - for (int i = -128; i < 128; i++) { - const int arr_idx = i + 128; - char ch = char(i); + if (value == 0) return "0"; - if (ch == delimiter) - ret[arr_idx] = ParseFlags::DELIMITER; - else if (ch == '\r' || ch == '\n') - ret[arr_idx] = ParseFlags::NEWLINE; - else - ret[arr_idx] = ParseFlags::NOT_SPECIAL; + while (value > 0) { + digits_reverse += (char)('0' + (value % 10)); + value /= 10; } - return ret; + return std::string(digits_reverse.rbegin(), digits_reverse.rend()); } - /** Create a vector v where each index i corresponds to the - * ASCII number for a character and, v[i + 128] labels it according to - * the CSVReader::ParseFlags enum - */ - HEDLEY_CONST CONSTEXPR ParseFlagMap make_parse_flags(char delimiter, char quote_char) { - std::array 
ret = make_parse_flags(delimiter); - ret[(size_t)quote_char + 128] = ParseFlags::QUOTE; - return ret; + /** to_string() for signed integers */ + template< + typename T, + csv::enable_if_t::value && std::is_signed::value, int> = 0 + > + inline std::string to_string(T value) { + if (value >= 0) + return to_string((size_t)value); + + return "-" + to_string((size_t)(value * -1)); } - /** Create a vector v where each index i corresponds to the - * ASCII number for a character c and, v[i + 128] is true if - * c is a whitespace character - */ - HEDLEY_CONST CONSTEXPR WhitespaceMap make_ws_flags(const char * ws_chars, size_t n_chars) { - std::array ret = {}; - for (int i = -128; i < 128; i++) { - const int arr_idx = i + 128; - char ch = char(i); - ret[arr_idx] = false; + /** to_string() for floating point numbers */ + template< + typename T, + csv::enable_if_t::value, int> = 0 + > + inline std::string to_string(T value) { +#ifdef __clang__ + return std::to_string(value); +#else + // TODO: Figure out why the below code doesn't work on clang + std::string result = ""; - for (size_t j = 0; j < n_chars; j++) { - if (ws_chars[j] == ch) { - ret[arr_idx] = true; + T integral_part; + T fractional_part = std::abs(std::modf(value, &integral_part)); + integral_part = std::abs(integral_part); + + // Integral part + if (value < 0) result = "-"; + + if (integral_part == 0) { + result += "0"; + } + else { + for (int n_digits = num_digits(integral_part); n_digits > 0; n_digits --) { + int digit = (int)(std::fmod(integral_part, pow10(n_digits)) / pow10(n_digits - 1)); + result += (char)('0' + digit); } } - } - return ret; + // Decimal part + result += "."; + + if (fractional_part > 0) { + fractional_part *= (T)(pow10(DECIMAL_PLACES)); + for (int n_digits = DECIMAL_PLACES; n_digits > 0; n_digits--) { + int digit = (int)(std::fmod(fractional_part, pow10(n_digits)) / pow10(n_digits - 1)); + result += (char)('0' + digit); + } + } + else { + result += "0"; + } + + return result; +#endif } + } - 
struct GuessScore { - double score; - size_t header; - }; + /** Sets how many places after the decimal will be written for floating point numbers + * + * @param precision Number of decimal places + */ +#ifndef __clang___ + inline static void set_decimal_places(int precision) { + internals::DECIMAL_PLACES = precision; + } +#endif + + /** @name CSV Writing */ + ///@{ + /** + * Class for writing delimiter separated values files + * + * To write formatted strings, one should + * -# Initialize a DelimWriter with respect to some output stream + * -# Call write_row() on std::vectors of unformatted text + * + * @tparam OutputStream The output stream, e.g. `std::ofstream`, `std::stringstream` + * @tparam Delim The delimiter character + * @tparam Quote The quote character + * @tparam Flush True: flush after every writing function, + * false: you need to flush explicitly if needed. + * In both cases the destructor will flush. + * + * @par Hint + * Use the aliases csv::CSVWriter to write CSV + * formatted strings and csv::TSVWriter + * to write tab separated strings + * + * @par Example w/ std::vector, std::deque, std::list + * @snippet test_write_csv.cpp CSV Writer Example + * + * @par Example w/ std::tuple + * @snippet test_write_csv.cpp CSV Writer Tuple Example + */ + template + class DelimWriter { + public: + /** Construct a DelimWriter over the specified output stream + * + * @param _out Stream to write to + * @param _quote_minimal Limit field quoting to only when necessary + */ - CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format); + DelimWriter(OutputStream& _out, bool _quote_minimal = true) + : out(_out), quote_minimal(_quote_minimal) {}; - CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); + /** Construct a DelimWriter over the file + * + * @param[out] filename File to write to + */ + DelimWriter(const std::string& filename) : DelimWriter(std::ifstream(filename)) {}; - /** Read 
the first 500KB of a CSV file */ - CSV_INLINE std::string get_csv_head(csv::string_view filename); - } -} + /** Destructor will flush remaining data + * + */ + ~DelimWriter() { + out.flush(); + } -/** The all encompassing namespace */ -namespace csv { - /** Stuff that is generally not of interest to end-users */ - namespace internals { - std::string format_row(const std::vector& row, csv::string_view delim = ", "); + /** Format a sequence of strings and write to CSV according to RFC 4180 + * + * @warning This does not check to make sure row lengths are consistent + * + * @param[in] record Sequence of strings to be formatted + * + * @return The current DelimWriter instance (allowing for operator chaining) + */ + template + DelimWriter& operator<<(const std::array& record) { + for (size_t i = 0; i < Size; i++) { + out << csv_escape(record[i]); + if (i + 1 != Size) out << Delim; + } - std::vector _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv()); - } + end_out(); + return *this; + } - std::vector get_col_names( - csv::string_view filename, - const CSVFormat format = CSVFormat::guess_csv()); + /** @copydoc operator<< */ + template + DelimWriter& operator<<(const std::tuple& record) { + this->write_tuple<0, T...>(record); + return *this; + } - /** Guess the delimiter used by a delimiter-separated values file */ - CSVGuessResult guess_format(csv::string_view filename, - const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); + /** + * @tparam T A container such as std::vector, std::deque, or std::list + * + * @copydoc operator<< + */ + template< + typename T, typename Alloc, template class Container, + + // Avoid conflicting with tuples with two elements + csv::enable_if_t::value, int> = 0 + > + DelimWriter& operator<<(const Container& record) { + const size_t ilen = record.size(); + size_t i = 0; + for (const auto& field : record) { + out << csv_escape(field); + if (i + 1 != ilen) out << Delim; + i++; + } + + end_out(); + 
return *this; + } - /** @class CSVReader - * @brief Main class for parsing CSVs from files and in-memory sources - * - * All rows are compared to the column names for length consistency - * - By default, rows that are too short or too long are dropped - * - Custom behavior can be defined by overriding bad_row_handler in a subclass - */ - class CSVReader { - public: - /** - * An input iterator capable of handling large files. - * @note Created by CSVReader::begin() and CSVReader::end(). + /** Flushes the written data * - * @par Iterating over a file - * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 1 - * - * @par Using with `` library - * @snippet tests/test_csv_iterator.cpp CSVReader Iterator 2 */ - class iterator { - public: - #ifndef DOXYGEN_SHOULD_SKIP_THIS - using value_type = CSVRow; - using difference_type = std::ptrdiff_t; - using pointer = CSVRow * ; - using reference = CSVRow & ; - using iterator_category = std::input_iterator_tag; - #endif + void flush() { + out.flush(); + } - iterator() = default; - iterator(CSVReader* reader) : daddy(reader) {}; - iterator(CSVReader*, CSVRow&&); + private: + template< + typename T, + csv::enable_if_t< + !std::is_convertible::value + && !std::is_convertible::value + , int> = 0 + > + std::string csv_escape(T in) { + return internals::to_string(in); + } - /** Access the CSVRow held by the iterator */ - CONSTEXPR reference operator*() { return this->row; } + template< + typename T, + csv::enable_if_t< + std::is_convertible::value + || std::is_convertible::value + , int> = 0 + > + std::string csv_escape(T in) { + IF_CONSTEXPR(std::is_convertible::value) { + return _csv_escape(in); + } + + return _csv_escape(std::string(in)); + } - /** Return a pointer to the CSVRow the iterator has stopped at */ - CONSTEXPR pointer operator->() { return &(this->row); } + std::string _csv_escape(csv::string_view in) { + /** Format a string to be RFC 4180-compliant + * @param[in] in String to be CSV-formatted + * @param[out] 
quote_minimal Only quote fields if necessary. + * If False, everything is quoted. + */ - iterator& operator++(); /**< Pre-increment iterator */ - iterator operator++(int); /**< Post-increment ierator */ - iterator& operator--(); + // Do we need a quote escape + bool quote_escape = false; - /** Returns true if iterators were constructed from the same CSVReader - * and point to the same row - */ - CONSTEXPR bool operator==(const iterator& other) const { - return (this->daddy == other.daddy) && (this->i == other.i); + for (auto ch : in) { + if (ch == Quote || ch == Delim || ch == '\r' || ch == '\n') { + quote_escape = true; + break; + } } - CONSTEXPR bool operator!=(const iterator& other) const { return !operator==(other); } - private: - CSVReader * daddy = nullptr; // Pointer to parent - CSVRow row; // Current row - RowCount i = 0; // Index of current row - }; + if (!quote_escape) { + if (quote_minimal) return std::string(in); + else { + std::string ret(1, Quote); + ret += in.data(); + ret += Quote; + return ret; + } + } - /** @name Constructors - * Constructors for iterating over large files and parsing in-memory sources. - */ - ///@{ - CSVReader(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv()); - CSVReader(CSVFormat format = CSVFormat()); - ///@} + // Start initial quote escape sequence + std::string ret(1, Quote); + for (auto ch: in) { + if (ch == Quote) ret += std::string(2, Quote); + else ret += ch; + } - CSVReader(const CSVReader&) = delete; // No copy constructor - CSVReader(CSVReader&&) = default; // Move constructor - CSVReader& operator=(const CSVReader&) = delete; // No copy assignment - CSVReader& operator=(CSVReader&& other) = default; + // Finish off quote escape + ret += Quote; + return ret; + } - /** @name Reading In-Memory Strings - * You can piece together incomplete CSV fragments by calling feed() on them - * before finally calling end_feed(). 
- * - * Alternatively, you can also use the parse() shorthand function for - * smaller strings. - */ - ///@{ - void feed(csv::string_view in); - void end_feed(); - ///@} + /** Recurisve template for writing std::tuples */ + template + typename std::enable_if::type write_tuple(const std::tuple& record) { + out << csv_escape(std::get(record)); - /** @name Retrieving CSV Rows */ - ///@{ - bool read_row(CSVRow &row); - iterator begin(); - HEDLEY_CONST iterator end() const; - ///@} + IF_CONSTEXPR (Index + 1 < sizeof...(T)) out << Delim; - /** @name CSV Metadata */ - ///@{ - CSVFormat get_format() const; - std::vector get_col_names() const; - int index_of(csv::string_view col_name) const; - ///@} + this->write_tuple(record); + } - /** @name CSV Metadata: Attributes */ - ///@{ - bool empty() const { return this->size() == 0; } - RowCount size() const { return this->num_rows; } - bool utf8_bom() const { return this->_utf8_bom; } - ///@} + /** Base case for writing std::tuples */ + template + typename std::enable_if::type write_tuple(const std::tuple& record) { + (void)record; + end_out(); + } - protected: - /** - * \defgroup csv_internal CSV Parser Internals - * @brief Internals of CSVReader. Only maintainers and those looking to - * extend the parser should read this. - * @{ - */ + /** Ends a line in 'out' and flushes, if Flush is true.*/ + void end_out() { + out << '\n'; + IF_CONSTEXPR(Flush) out.flush(); + } - /** Multi-threaded Reading State, including synchronization objects that cannot be moved. */ - struct ThreadedReadingState { - std::deque feed_buffer; /**< Message queue for worker */ - std::mutex feed_lock; /**< Allow only one worker to write */ - std::condition_variable feed_cond; /**< Wake up worker */ - }; + OutputStream & out; + bool quote_minimal; + }; - /** Open a file for reading. 
*/ - void fopen(csv::string_view filename); + /** An alias for csv::DelimWriter for writing standard CSV files + * + * @sa csv::DelimWriter::operator<<() + * + * @note Use `csv::make_csv_writer()` to in instatiate this class over + * an actual output stream. + */ + template + using CSVWriter = DelimWriter; - size_t file_size; + /** Class for writing tab-separated values files + * + * @sa csv::DelimWriter::write_row() + * @sa csv::DelimWriter::operator<<() + * + * @note Use `csv::make_tsv_writer()` to in instatiate this class over + * an actual output stream. + */ + template + using TSVWriter = DelimWriter; - /** Sets this reader's column names and associated data */ - void set_col_names(const std::vector&); + /** Return a csv::CSVWriter over the output stream */ + template + inline CSVWriter make_csv_writer(OutputStream& out, bool quote_minimal=true) { + return CSVWriter(out, quote_minimal); + } - /** Returns true if we have reached end of file */ - bool eof() { return this->csv_mmap_eof; }; + /** Return a buffered csv::CSVWriter over the output stream (does not auto flush) */ + template + inline CSVWriter make_csv_writer_buffered(OutputStream& out, bool quote_minimal=true) { + return CSVWriter(out, quote_minimal); + } - /** @name CSV Settings **/ - ///@{ - CSVFormat _format; - ///@} + /** Return a csv::TSVWriter over the output stream */ + template + inline TSVWriter make_tsv_writer(OutputStream& out, bool quote_minimal=true) { + return TSVWriter(out, quote_minimal); + } - /** @name Parser State */ - ///@{ - /** Pointer to a object containing column information */ - internals::ColNamesPtr col_names = std::make_shared(); + /** Return a buffered csv::TSVWriter over the output stream (does not auto flush) */ + template + inline TSVWriter make_tsv_writer_buffered(OutputStream& out, bool quote_minimal=true) { + return TSVWriter(out, quote_minimal); + } + ///@} +} - // TODO: Update description - /** Buffer for current row being parsed */ - BasicCSVParser parser = 
BasicCSVParser(this->col_names); - /** Queue of parsed CSV rows */ - std::deque records; +namespace csv { + namespace internals { + CSV_INLINE size_t get_file_size(csv::string_view filename) { + std::ifstream infile(std::string(filename), std::ios::binary); + const auto start = infile.tellg(); + infile.seekg(0, std::ios::end); + const auto end = infile.tellg(); - /** Whether or not an attempt to find Unicode BOM has been made */ - bool unicode_bom_scan = false; + return end - start; + } - /** Whether or not rows before header were trimmed */ - bool header_trimmed = false; + CSV_INLINE std::string get_csv_head(csv::string_view filename) { + return get_csv_head(filename, get_file_size(filename)); + } - /** The number of columns in this CSV */ - size_t n_cols = 0; + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { + const size_t bytes = 500000; - /** How many rows (minus header) have been parsed so far */ - RowCount num_rows = 0; + std::error_code error; + size_t length = std::min((size_t)file_size, bytes); + auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); - /** Set to true if UTF-8 BOM was detected */ - bool _utf8_bom = false; - ///@} + if (error) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } - /** @name Multi-Threaded File Reading Functions */ - ///@{ - void feed(internals::WorkItem&&); /**< @brief Helper for read_csv_worker() */ - void read_csv(const size_t& bytes = internals::ITERATION_CHUNK_SIZE); + return std::string(mmap.begin(), mmap.end()); + } - size_t relative_mmap_pos = 0; +#ifdef _MSC_VER +#pragma region IBasicCVParser +#endif + CSV_INLINE IBasicCSVParser::IBasicCSVParser( + const CSVFormat& format, + const ColNamesPtr& col_names + ) : _col_names(col_names) { + if (format.no_quote) { + _parse_flags = internals::make_parse_flags(format.get_delim()); + } + else { + _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); + } - void 
read_csv_worker(); - std::string _filename = ""; - ///@} + _ws_flags = internals::make_ws_flags( + format.trim_chars.data(), format.trim_chars.size() + ); + } - /** @name Multi-Threaded File Reading: Flags and State */ - ///@{ - mio::mmap_source csv_mmap; - bool csv_mmap_eof = true; - size_t csv_mmap_pos = 0; - std::unique_ptr feed_state; - ///@} + CSV_INLINE void IBasicCSVParser::end_feed() { + using internals::ParseFlags; - /**@}*/ // End of parser internals + bool empty_last_field = this->data_ptr + && this->data_ptr->_data + && !this->data_ptr->data.empty() + && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER + || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); - private: - /** Set parse and whitespace flags */ - void set_parse_flags(const CSVFormat& format); - }; -} -/** @file - * Calculates statistics from CSV files - */ + // Push field + if (this->field_length > 0 || empty_last_field) { + this->push_field(); + } -#include -#include + // Push row + if (this->current_row.size() > 0) + this->push_row(); + } -namespace csv { - /** Class for calculating statistics from CSV files and in-memory sources - * - * **Example** - * \include programs/csv_stats.cpp - * - */ - class CSVStat : public CSVReader { - public: - using FreqCount = std::unordered_map; - using TypeCount = std::unordered_map; + CSV_INLINE void IBasicCSVParser::parse_field() noexcept { + using internals::ParseFlags; + auto& in = this->data_ptr->data; - void end_feed(); - std::vector get_mean() const; - std::vector get_variance() const; - std::vector get_mins() const; - std::vector get_maxes() const; - std::vector get_counts() const; - std::vector get_dtypes() const; + // Trim off leading whitespace + while (data_pos < in.size() && ws_flag(in[data_pos])) + data_pos++; - CSVStat(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv()); - CSVStat(CSVFormat format = CSVFormat()) : CSVReader(format) {}; - private: - // An array of rolling averages - // Each 
index corresponds to the rolling mean for the column at said index - std::vector rolling_means; - std::vector rolling_vars; - std::vector mins; - std::vector maxes; - std::vector counts; - std::vector dtypes; - std::vector n; + if (field_start == UNINITIALIZED_FIELD) + field_start = (int)(data_pos - current_row_start()); - // Statistic calculators - void variance(const long double&, const size_t&); - void count(CSVField&, const size_t&); - void min_max(const long double&, const size_t&); - void dtype(CSVField&, const size_t&); + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) + data_pos++; - void calc(); - void calc_worker(const size_t&); - }; -} + field_length = data_pos - (field_start + current_row_start()); -#include -#include -#include + // Trim off trailing whitespace, this->field_length constraint matters + // when field is entirely whitespace + for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) + this->field_length--; + } -namespace csv { - /** Returned by get_file_info() */ - struct CSVFileInfo { - std::string filename; /**< Filename */ - std::vector col_names; /**< CSV column names */ - char delim; /**< Delimiting character */ - RowCount n_rows; /**< Number of rows in a file */ - int n_cols; /**< Number of columns in a CSV */ - }; + CSV_INLINE void IBasicCSVParser::push_field() + { + // Update + if (field_has_double_quote) { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 
0 : (unsigned int)field_start, + field_length, + true + ); + field_has_double_quote = false; - /** @name Shorthand Parsing Functions - * @brief Convienience functions for parsing small strings - */ - ///@{ - CSVReader operator ""_csv(const char*, size_t); - CSVReader parse(csv::string_view in, CSVFormat format = CSVFormat()); - ///@} + } + else { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length + ); + } - /** @name Utility Functions */ - ///@{ - std::unordered_map csv_data_types(const std::string&); - CSVFileInfo get_file_info(const std::string& filename); - int get_col_pos(const std::string filename, const std::string col_name, - const CSVFormat format = CSVFormat::guess_csv()); - ///@} -} -/** @file - * A standalone header file for writing delimiter-separated files - */ + current_row.row_length++; + + // Reset field state + field_start = UNINITIALIZED_FIELD; + field_length = 0; + } + + /** @return The number of characters parsed that belong to complete rows */ + CSV_INLINE size_t IBasicCSVParser::parse() + { + using internals::ParseFlags; + + this->quote_escape = false; + this->data_pos = 0; + this->current_row_start() = 0; + this->trim_utf8_bom(); + + auto& in = this->data_ptr->data; + while (this->data_pos < in.size()) { + switch (compound_parse_flag(in[this->data_pos])) { + case ParseFlags::DELIMITER: + this->push_field(); + this->data_pos++; + break; + + case ParseFlags::NEWLINE: + this->data_pos++; + + // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) + while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + this->data_pos++; + + // End of record -> Write record + this->push_field(); + this->push_row(); -#include -#include -#include -#include + // Reset + this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); + break; -namespace csv { - /** @name CSV Writing */ - ///@{ - #ifndef DOXYGEN_SHOULD_SKIP_THIS - 
template - inline std::string csv_escape(csv::string_view in, const bool quote_minimal = true) { - /** Format a string to be RFC 4180-compliant - * @param[in] in String to be CSV-formatted - * @param[out] quote_minimal Only quote fields if necessary. - * If False, everything is quoted. - */ + case ParseFlags::NOT_SPECIAL: + this->parse_field(); + break; + + case ParseFlags::QUOTE_ESCAPE_QUOTE: + if (data_pos + 1 == in.size()) return this->current_row_start(); + else if (data_pos + 1 < in.size()) { + auto next_ch = parse_flag(in[data_pos + 1]); + if (next_ch >= ParseFlags::DELIMITER) { + quote_escape = false; + data_pos++; + break; + } + else if (next_ch == ParseFlags::QUOTE) { + // Case: Escaped quote + data_pos += 2; + this->field_length += 2; + this->field_has_double_quote = true; + break; + } + } + + // Case: Unescaped single quote => not strictly valid but we'll keep it + this->field_length++; + data_pos++; - // Sequence used for escaping quote characters that appear in text - constexpr char double_quote[3] = { Quote, Quote }; + break; - std::string new_string; - bool quote_escape = false; // Do we need a quote escape - new_string += Quote; // Start initial quote escape sequence + default: // Quote (currently not quote escaped) + if (this->field_length == 0) { + quote_escape = true; + data_pos++; + if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) + field_start = (int)(data_pos - current_row_start()); + break; + } - for (size_t i = 0; i < in.size(); i++) { - switch (in[i]) { - case Quote: - new_string += double_quote; - quote_escape = true; - break; - case Delim: - quote_escape = true; - HEDLEY_FALL_THROUGH; - default: - new_string += in[i]; + // Case: Unescaped quote + this->field_length++; + data_pos++; + + break; + } } + + return this->current_row_start(); } - if (quote_escape || !quote_minimal) { - new_string += Quote; // Finish off quote escape - return new_string; + CSV_INLINE void IBasicCSVParser::push_row() { + 
current_row.row_length = fields->size() - current_row.fields_start; + this->_records->push_back(std::move(current_row)); } - return std::string(in); - } - #endif + CSV_INLINE void IBasicCSVParser::reset_data_ptr() { + this->data_ptr = std::make_shared(); + this->data_ptr->parse_flags = this->_parse_flags; + this->data_ptr->col_names = this->_col_names; + this->fields = &(this->data_ptr->fields); + } - /** - * Class for writing delimiter separated values files - * - * To write formatted strings, one should - * -# Initialize a DelimWriter with respect to some output stream - * -# Call write_row() on std::vectors of unformatted text - * - * @tparam OutputStream The output stream, e.g. `std::ofstream`, `std::stringstream` - * @tparam Delim The delimiter character - * @tparam Quote The quote character - * - * @par Hint - * Use the aliases csv::CSVWriter to write CSV - * formatted strings and csv::TSVWriter - * to write tab separated strings - * - * @par Example - * @snippet test_write_csv.cpp CSV Writer Example - */ - template - class DelimWriter { - public: - /** Construct a DelimWriter over the specified output stream */ - DelimWriter(OutputStream& _out) : out(_out) {}; + CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { + auto& data = this->data_ptr->data; - /** Construct a DelimWriter over the file - * - * @param[out] filename File to write to - */ - DelimWriter(const std::string& filename) : DelimWriter(std::ifstream(filename)) {}; + if (!this->unicode_bom_scan && data.size() >= 3) { + if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { + this->data_pos += 3; // Remove BOM from input string + this->_utf8_bom = true; + } - /** Format a sequence of strings and write to CSV according to RFC 4180 - * - * @warning This does not check to make sure row lengths are consistent - * - * @param[in] record Sequence of strings to be formatted - * @param quote_minimal Only quote fields if necessary - */ - template class Container> - void write_row(const Container& 
record, bool quote_minimal = true) { - const size_t ilen = record.size(); - size_t i = 0; - for (auto& field: record) { - out << csv_escape(field, quote_minimal); - if (i + 1 != ilen) out << Delim; - i++; + this->unicode_bom_scan = true; } - - out << std::endl; } +#ifdef _MSC_VER +#pragma endregion +#endif - /** @copydoc write_row - * @return The current DelimWriter instance (allowing for operator chaining) - */ - template class Container> - DelimWriter& operator<<(const Container& record) { - this->write_row(record); - return *this; - } +#ifdef _MSC_VER +#pragma region Specializations +#endif + CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; + this->reset_data_ptr(); - private: - OutputStream & out; - }; + // Create memory map + size_t length = std::min(this->source_size - this->mmap_pos, bytes); + std::error_code error; + this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); + this->mmap_pos += length; + if (error) throw error; - /* Uncomment when C++17 support is better - template - DelimWriter(OutputStream&) -> DelimWriter; - */ + auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); - /** Class for writing CSV files - * - * @sa csv::DelimWriter::write_row() - * @sa csv::DelimWriter::operator<<() - * - * @note Use `csv::make_csv_writer()` to in instatiate this class over - * an actual output stream. - */ - template - using CSVWriter = DelimWriter; + // Create string view + this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); - /** Class for writing tab-separated values files -* - * @sa csv::DelimWriter::write_row() - * @sa csv::DelimWriter::operator<<() - * - * @note Use `csv::make_tsv_writer()` to in instatiate this class over - * an actual output stream. 
- */ - template - using TSVWriter = DelimWriter; + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); - // - // Temporary: Until more C++17 compilers support template deduction guides - // - template - inline CSVWriter make_csv_writer(OutputStream& out) { - /** Return a CSVWriter over the output stream */ - return CSVWriter(out); - } + if (this->mmap_pos == this->source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } - template - inline TSVWriter make_tsv_writer(OutputStream& out) { - /** Return a TSVWriter over the output stream */ - return TSVWriter(out); + this->mmap_pos -= (length - remainder); + } +#ifdef _MSC_VER +#pragma endregion +#endif } - - ///@} } @@ -6366,7 +7229,7 @@ namespace csv { return CSV_NOT_FOUND; } - CSV_INLINE size_t ColNames::size() const { + CSV_INLINE size_t ColNames::size() const noexcept { return this->col_names.size(); } @@ -6413,6 +7276,8 @@ namespace csv { } CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + this->header = row; this->col_names = {}; return *this; @@ -6465,10 +7330,6 @@ namespace csv { * @brief Defines functionality needed for basic CSV parsing */ -#include -#include // For read_csv() -#include -#include namespace csv { namespace internals { @@ -6478,8 +7339,9 @@ namespace csv { for (size_t i = 0; i < row.size(); i++) { ret << row[i]; if (i + 1 < row.size()) ret << delim; - else ret << std::endl; + else ret << '\n'; } + ret.flush(); return ret.str(); } @@ -6491,24 +7353,94 @@ namespace csv { * */ CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { - auto parse_flags = internals::make_parse_flags(format.get_delim()); - if (format.is_quoting_enabled()) { - parse_flags = internals::make_parse_flags(format.get_delim(), format.get_quote_char()); - } - // Parse the CSV auto trim_chars = format.get_trim_chars(); + std::stringstream source(head.data()); + 
RowCollection rows; - BasicCSVParser parser( - parse_flags, - internals::make_ws_flags(trim_chars.data(), trim_chars.size()) - ); - - std::deque rows; - parser.parse(head, rows); + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); return CSVRow(std::move(rows[format.get_header()])); } + + CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) { + // Frequency counter of row length + std::unordered_map row_tally = { { 0, 0 } }; + + // Map row lengths to row num where they first occurred + std::unordered_map row_when = { { 0, 0 } }; + + // Parse the CSV + std::stringstream source(head.data()); + RowCollection rows; + + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); + + for (size_t i = 0; i < rows.size(); i++) { + auto& row = rows[i]; + + // Ignore zero-length rows + if (row.size() > 0) { + if (row_tally.find(row.size()) != row_tally.end()) { + row_tally[row.size()]++; + } + else { + row_tally[row.size()] = 1; + row_when[row.size()] = i; + } + } + } + + double final_score = 0; + size_t header_row = 0; + + // Final score is equal to the largest + // row size times rows of that size + for (auto& pair : row_tally) { + auto row_size = pair.first; + auto row_count = pair.second; + double score = (double)(row_size * row_count); + if (score > final_score) { + final_score = score; + header_row = row_when[row_size]; + } + } + + return { + final_score, + header_row + }; + } + + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { + /** For each delimiter, find out which row length was most common. + * The delimiter with the longest mode row length wins. + * Then, the line number of the header row is the first row with + * the mode row length. 
+ */ + + CSVFormat format; + size_t max_score = 0, + header = 0; + char current_delim = delims[0]; + + for (char cand_delim : delims) { + auto result = calculate_score(head, format.delimiter(cand_delim)); + + if ((size_t)result.score > max_score) { + max_score = (size_t)result.score; + current_delim = cand_delim; + header = result.header; + } + } + + return { current_delim, (int)header }; + } } /** Return a CSV's column names @@ -6535,23 +7467,10 @@ namespace csv { return internals::_guess_format(head, delims); } - /** Allows parsing in-memory sources (by calling feed() and end_feed()). */ - CSV_INLINE CSVReader::CSVReader(CSVFormat format) : - unicode_bom_scan(!format.unicode_detect), feed_state(new ThreadedReadingState) { - if (!format.col_names.empty()) { - this->set_col_names(format.col_names); - } - - this->set_parse_flags(format); - } - - /** Allows reading a CSV file in chunks, using overlapped - * threads for simulatenously reading from disk and parsing. - * Rows should be retrieved with read_row() or by using - * CSVReader::iterator. + /** Reads an arbitrarily large CSV file using memory-mapped IO. * - * **Details:** Reads the first 500kB of a CSV file to infer file information - * such as column names and delimiting character. + * **Details:** Reads the first block of a CSV file synchronously to get information + * such as column names and delimiting character. 
* * @param[in] filename Path to CSV file * @param[in] format Format of the CSV file @@ -6559,25 +7478,23 @@ namespace csv { * \snippet tests/test_read_csv.cpp CSVField Example * */ - CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : feed_state(new ThreadedReadingState) { + CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { auto head = internals::get_csv_head(filename); + using Parser = internals::MmapParser; /** Guess delimiter and header row */ if (format.guess_delim()) { auto guess_result = internals::_guess_format(head, format.possible_delimiters); format.delimiter(guess_result.delim); format.header = guess_result.header_row; + this->_format = format; } - if (format.col_names.empty()) { - this->set_col_names(internals::_get_col_names(head, format)); - } - else { + if (!format.col_names.empty()) this->set_col_names(format.col_names); - } - this->set_parse_flags(format); - this->fopen(filename); + this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 + this->initial_read(); } /** Return the format of the original raw CSV */ @@ -6613,116 +7530,21 @@ namespace csv { return CSV_NOT_FOUND; } - CSV_INLINE void CSVReader::feed(internals::WorkItem&& buff) { - this->feed( csv::string_view(buff.first.get(), buff.second) ); - } - - /** Parse a CSV-formatted string. - * - * @par Usage - * Incomplete CSV fragments can be joined together by calling feed() on them sequentially. - * - * @note - * `end_feed()` should be called after the last string. 
- */ - CSV_INLINE void CSVReader::feed(csv::string_view in) { - if (in.empty()) return; - - /** Handle possible Unicode byte order mark */ - if (!this->unicode_bom_scan) { - if (in[0] == '\xEF' && in[1] == '\xBB' && in[2] == '\xBF') { - in.remove_prefix(3); // Remove BOM from input string - this->_utf8_bom = true; - } - - this->unicode_bom_scan = true; - } - - this->parser.parse(in, this->records); - + CSV_INLINE void CSVReader::trim_header() { if (!this->header_trimmed) { - for (int i = 0; i <= this->_format.header && !this->records.empty(); i++) { + for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { if (i == this->_format.header && this->col_names->empty()) { - this->set_col_names(CSVRow(std::move(this->records.front()))); + this->set_col_names(this->records->pop_front()); + } + else { + this->records->pop_front(); } - - this->records.pop_front(); } this->header_trimmed = true; } } - CSV_INLINE void CSVReader::end_feed() { - /** Indicate that there is no more data to receive, - * and handle the last row - */ - this->parser.end_feed(this->records); - } - - /** Worker thread for read_csv() which parses CSV rows (while the main - * thread pulls data from disk) - */ - CSV_INLINE void CSVReader::read_csv_worker() { - while (true) { - std::unique_lock lock{ this->feed_state->feed_lock }; // Get lock - this->feed_state->feed_cond.wait(lock, // Wait - [this] { return !(this->feed_state->feed_buffer.empty()); }); - - // Wake-up - auto in = std::move(this->feed_state->feed_buffer.front()); - this->feed_state->feed_buffer.pop_front(); - - // Nullptr --> Die - if (!in.first) break; - - lock.unlock(); // Release lock - this->feed(std::move(in)); - } - } - - CSV_INLINE void CSVReader::set_parse_flags(const CSVFormat& format) - { - this->_format = format; - if (format.no_quote) { - this->parser.set_parse_flags(internals::make_parse_flags(format.get_delim())); - } - else { - this->parser.set_parse_flags(internals::make_parse_flags(format.get_delim(), 
format.quote_char)); - } - - this->parser.set_ws_flags(internals::make_ws_flags(format.trim_chars.data(), format.trim_chars.size())); - } - - CSV_INLINE void CSVReader::fopen(csv::string_view filename) { - this->_filename = filename; - - if (!this->csv_mmap.is_open()) { - this->csv_mmap_eof = false; - std::ifstream infile(_filename, std::ios::binary); - const auto start = infile.tellg(); - infile.seekg(0, std::ios::end); - const auto end = infile.tellg(); - this->file_size = end - start; - - std::error_code error; - - if (internals::get_available_memory() > this->file_size * 2) { - this->csv_mmap.map(filename, error); - } - else { - this->csv_mmap.map(filename, 0, - std::min((size_t)csv::internals::ITERATION_CHUNK_SIZE, this->file_size), - error - ); - } - - if (error) { - throw error; - } - } - } - /** * @param[in] names Column names */ @@ -6733,83 +7555,38 @@ namespace csv { } /** - * Parse a CSV file using multiple threads - * - * @pre CSVReader::infile points to a valid file handle, i.e. CSVReader::fopen was called + * Read a chunk of CSV data. * - * @param[in] bytes Number of bytes to read. 
- * @see CSVReader::read_row() - */ - CSV_INLINE void CSVReader::read_csv(const size_t& bytes) { - if (this->_filename.empty()) { - return; - } - - const size_t BUFFER_UPPER_LIMIT = std::min(bytes, (size_t)1000000); - std::unique_ptr buffer(new char[BUFFER_UPPER_LIMIT]); - auto * HEDLEY_RESTRICT line_buffer = buffer.get(); - line_buffer[0] = '\0'; - - std::thread worker(&CSVReader::read_csv_worker, this); - - size_t strlen = 0; - for (size_t processed = 0; this->csv_mmap_pos < this->file_size && processed < bytes; this->csv_mmap_pos++) { - if (this->relative_mmap_pos == this->csv_mmap.length()) { - std::error_code error; - - size_t length = std::min(this->file_size - this->csv_mmap_pos, csv::internals::ITERATION_CHUNK_SIZE); - this->csv_mmap = mio::make_mmap_source(this->_filename, this->csv_mmap_pos, - length, - error - ); - - if (error) { - throw error; - } - - this->relative_mmap_pos = 0; - } - - line_buffer[strlen] = this->csv_mmap[this->relative_mmap_pos]; - strlen++; - this->relative_mmap_pos++; - - if (strlen == BUFFER_UPPER_LIMIT - 1) { - processed += strlen; - line_buffer[strlen] = '\0'; - - std::unique_lock lock{ this->feed_state->feed_lock }; - - this->feed_state->feed_buffer.push_back(std::make_pair<>(std::move(buffer), strlen)); + * @note This method is meant to be run on its own thread. Only one `read_csv()` thread + * should be active at a time. + * + * @param[in] bytes Number of bytes to read. 
+ * + * @see CSVReader::read_csv_worker + * @see CSVReader::read_row() + */ + CSV_INLINE bool CSVReader::read_csv(size_t bytes) { + // Tell read_row() to listen for CSV rows + this->records->notify_all(); - buffer = std::unique_ptr(new char[BUFFER_UPPER_LIMIT]); // New pointer - line_buffer = buffer.get(); - line_buffer[0] = '\0'; - strlen = 0; + this->parser->set_output(*this->records); + this->parser->next(bytes); - this->feed_state->feed_cond.notify_one(); - } + if (!this->header_trimmed) { + this->trim_header(); } - // Feed remaining bits - std::unique_lock lock{ this->feed_state->feed_lock }; - this->feed_state->feed_buffer.push_back(std::make_pair<>(std::move(buffer), strlen)); - this->feed_state->feed_buffer.push_back(std::make_pair<>(nullptr, 0)); // Termination signal - this->feed_state->feed_cond.notify_one(); - lock.unlock(); - worker.join(); - - if (this->csv_mmap_pos == this->csv_mmap.length()) { - this->csv_mmap_eof = true; - this->end_feed(); - } + // Tell read_row() to stop waiting + this->records->kill_all(); + + return true; } /** * Retrieve rows as CSVRow objects, returning true if more rows are available. * - * **Performance Notes**: - * - The number of rows read in at a time is determined by csv::ITERATION_CHUNK_SIZE + * @par Performance Notes + * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time * - For performance details, read the documentation for CSVRow and CSVField. 
* * @param[out] row The variable where the parsed row will be stored @@ -6820,33 +7597,36 @@ namespace csv { * */ CSV_INLINE bool CSVReader::read_row(CSVRow &row) { - if (this->records.empty()) { - if (!this->eof()) { - this->read_csv(internals::ITERATION_CHUNK_SIZE); - } - else return false; // Stop reading - } + while (true) { + if (this->records->empty()) { + if (this->records->is_waitable()) + // Reading thread is currently active => wait for it to populate records + this->records->wait(); + else if (this->parser->eof()) + // End of file and no more records + return false; + else { + // Reading thread is not active => start another one + if (this->read_csv_worker.joinable()) + this->read_csv_worker.join(); - while (!this->records.empty()) { - if (this->records.front().size() != this->n_cols && + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + } + } + else if (this->records->front().size() != this->n_cols && this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { + auto errored_row = this->records->pop_front(); + if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { - auto errored_row = std::move(this->records.front()); - if (this->records.front().size() < this->n_cols) { + if (errored_row.size() < this->n_cols) throw std::runtime_error("Line too short " + internals::format_row(errored_row)); - } throw std::runtime_error("Line too long " + internals::format_row(errored_row)); } - - // Silently drop row (default) - this->records.pop_front(); } else { - row = std::move(this->records.front()); - - this->num_rows++; - this->records.pop_front(); + row = this->records->pop_front(); + this->_n_rows++; return true; } } @@ -6855,105 +7635,6 @@ namespace csv { } } -#include - -namespace csv { - namespace internals { - CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { - // Frequency counter of row length - std::unordered_map row_tally = { { 0, 0 } }; - - // 
Map row lengths to row num where they first occurred - std::unordered_map row_when = { { 0, 0 } }; - - // Parse the CSV - BasicCSVParser parser( - internals::make_parse_flags(format.get_delim(), '"'), - internals::make_ws_flags({}, 0) - ); - - std::deque rows; - parser.parse(head, rows); - - for (size_t i = 0; i < rows.size(); i++) { - auto& row = rows[i]; - - // Ignore zero-length rows - if (row.size() > 0) { - if (row_tally.find(row.size()) != row_tally.end()) { - row_tally[row.size()]++; - } - else { - row_tally[row.size()] = 1; - row_when[row.size()] = i; - } - } - } - - double final_score = 0; - size_t header_row = 0; - - // Final score is equal to the largest - // row size times rows of that size - for (auto& [row_size, row_count] : row_tally) { - double score = (double)(row_size * row_count); - if (score > final_score) { - final_score = score; - header_row = row_when[row_size]; - } - } - - return { - final_score, - header_row - }; - } - - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { - /** For each delimiter, find out which row length was most common. - * The delimiter with the longest mode row length wins. - * Then, the line number of the header row is the first row with - * the mode row length. 
- */ - - CSVFormat format; - size_t max_score = 0, - header = 0; - char current_delim = delims[0]; - - for (char cand_delim : delims) { - auto result = calculate_score(head, format.delimiter(cand_delim)); - - if (result.score > max_score) { - max_score = (size_t)result.score; - current_delim = cand_delim; - header = result.header; - } - } - - return { current_delim, (int)header }; - } - - CSV_INLINE std::string get_csv_head(csv::string_view filename) { - const size_t bytes = 500000; - std::ifstream infile(filename.data()); - if (!infile.is_open()) { - throw std::runtime_error("Cannot open file " + std::string(filename)); - } - - std::unique_ptr buffer(new char[bytes + 1]); - char * head_buffer = buffer.get(); - - for (size_t i = 0; i < bytes + 1; i++) { - head_buffer[i] = '\0'; - } - - infile.read(head_buffer, bytes); - return std::string(head_buffer); - } - } -} /** @file * Defines an input iterator for csv::CSVReader */ @@ -6962,25 +7643,23 @@ namespace csv { namespace csv { /** Return an iterator to the first row in the reader */ CSV_INLINE CSVReader::iterator CSVReader::begin() { - if (this->records.empty()) { - this->read_csv(); + if (this->records->empty()) { + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv_worker.join(); // Still empty => return end iterator - if (this->records.empty()) { - return this->end(); - } + if (this->records->empty()) return this->end(); } - CSVReader::iterator ret(this, std::move(this->records.front())); - - this->records.pop_front(); + this->_n_rows++; + CSVReader::iterator ret(this, this->records->pop_front()); return ret; } /** A placeholder for the imaginary past the end row in a CSV. * Attempting to deference this will lead to bad things. 
*/ - CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const { + CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { return CSVReader::iterator(); } @@ -6995,7 +7674,10 @@ namespace csv { /** Advance the iterator by one row. If this CSVReader has an * associated file, then the iterator will lazily pull more data from - * that file until EOF. + * that file until the end of file is reached. + * + * @note This iterator does **not** block the thread responsible for parsing CSV. + * */ CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { if (!daddy->read_row(this->row)) { @@ -7015,6 +7697,7 @@ namespace csv { return temp; } } + /** @file * Defines the data type used for storing information about a CSV row */ @@ -7024,10 +7707,17 @@ namespace csv { namespace csv { namespace internals { - CSV_INLINE void CSVFieldArray::allocate() { - RawCSVField * buffer = new RawCSVField[single_buffer_capacity]; - buffers.push_back(buffer); + CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { + const size_t page_no = n / _single_buffer_capacity; + const size_t buffer_idx = (page_no < 1) ? 
n : n % _single_buffer_capacity; + return this->buffers[page_no][buffer_idx]; + } + + CSV_INLINE void CSVFieldList::allocate() { + buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); + _current_buffer_size = 0; + _back = buffers.back().get(); } } @@ -7064,7 +7754,6 @@ namespace csv { } CSV_INLINE CSVRow::operator std::vector() const { - std::vector ret; for (size_t i = 0; i < size(); i++) ret.push_back(std::string(this->get_field(i))); @@ -7074,23 +7763,21 @@ namespace csv { CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const { - if (index >= this->size()) { - throw std::runtime_error("Index out of bounds."); - } + using internals::ParseFlags; - size_t field_index = this->field_bounds_index + index; - const RawCSVField& raw_field = this->data->fields[field_index]; - bool has_doubled_quote = this->data->has_double_quotes.find(field_index) != this->data->has_double_quotes.end(); + if (index >= this->size()) + throw std::runtime_error("Index out of bounds."); - csv::string_view csv_field = csv::string_view(this->data->data).substr(this->data_start + raw_field.start); + const size_t field_index = this->fields_start + index; + auto& field = this->data->fields[field_index]; + auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - if (has_doubled_quote) { - std::string& ret = this->data->double_quote_fields[field_index]; - if (ret.empty()) { + if (field.has_double_quote) { + auto& value = this->data->double_quote_fields[field_index]; + if (value.empty()) { bool prev_ch_quote = false; - for (size_t i = 0; i < raw_field.length; i++) { - // TODO: Use parse flags - if (csv_field[i] == '"') { + for (size_t i = 0; i < field.length; i++) { + if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { if (prev_ch_quote) { prev_ch_quote = false; continue; @@ -7100,14 +7787,101 @@ namespace csv { } } - ret += csv_field[i]; + value += field_str[i]; } } - return csv::string_view(ret); + return 
csv::string_view(value); + } + + return field_str.substr(0, field.length); + } + + CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { + size_t start = 0, end = 0; + + // Trim out whitespace chars + for (; start < this->sv.size() && this->sv[start] == ' '; start++); + for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); + + int value_ = 0; + + size_t digits = (end - start); + size_t base16_exponent = digits - 1; + + if (digits == 0) return false; + + for (const auto& ch : this->sv.substr(start, digits)) { + int digit = 0; + + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + digit = static_cast(ch - '0'); + break; + case 'a': + case 'A': + digit = 10; + break; + case 'b': + case 'B': + digit = 11; + break; + case 'c': + case 'C': + digit = 12; + break; + case 'd': + case 'D': + digit = 13; + break; + case 'e': + case 'E': + digit = 14; + break; + case 'f': + case 'F': + digit = 15; + break; + default: + return false; + } + + value_ += digit * (int)pow(16, (double)base16_exponent); + base16_exponent--; + } + + parsedValue = value_; + return true; + } + + CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { + // If field has already been parsed to empty, no need to do it aagin: + if (this->_type == DataType::CSV_NULL) + return false; + + // Not yet parsed or possibly parsed with other decimalSymbol + if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) + this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again + + // Integral types are not affected by decimalSymbol and need not be parsed again + + // Either we already had an integral type before, or we we just got any numeric type now. 
+ if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { + dVal = this->value; + return true; } - return csv_field.substr(0, raw_field.length); + // CSV_NULL or CSV_STRING, not numeric + return false; } #ifdef _MSC_VER @@ -7123,11 +7897,11 @@ namespace csv { * @warning Attempting to dereference the end iterator results * in dereferencing a null pointer. */ - CSV_INLINE CSVRow::iterator CSVRow::end() const { + CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { return CSVRow::iterator(this, (int)this->size()); } - CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const { + CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { return std::reverse_iterator(this->end()); } @@ -7150,12 +7924,7 @@ namespace csv { } CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - #ifdef _MSC_BUILD return this->field; - #else - return this->field.get(); - #endif } CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { @@ -7190,7 +7959,7 @@ namespace csv { this->operator--(); return temp; } - + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { // Allows for iterator arithmetic return CSVRow::iterator(this->daddy, i + (int)n); @@ -7219,9 +7988,9 @@ namespace csv { The code is licensed under the [MIT License](http://opensource.org/licenses/MIT): - + Copyright © 2013-2015 Niels Lohmann. 
- + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, @@ -7229,10 +7998,10 @@ namespace csv { publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -7300,7 +8069,8 @@ namespace csv { } // create a result string of necessary size - std::string result(s.size() + space, '\\'); + size_t result_size = s.size() + space; + std::string result(result_size, '\\'); std::size_t pos = 0; for (const auto& c : s) @@ -7375,7 +8145,7 @@ namespace csv { if (c >= 0x00 && c <= 0x1f) { // print character c as \uxxxx - sprintf(&result[pos + 1], "u%04x", int(c)); + snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); pos += 6; // overwrite trailing null character result[pos] = '\\'; @@ -7409,7 +8179,7 @@ namespace csv { const size_t _n_cols = col_names.size(); std::string ret = "{"; - + for (size_t i = 0; i < _n_cols; i++) { auto& col = col_names[i]; auto field = this->operator[](col); @@ -7465,6 +8235,7 @@ namespace csv { return ret; } } + /** @file * Calculates statistics from CSV files */ @@ -7472,30 +8243,25 @@ namespace csv { #include namespace csv { + /** Calculate statistics for an arbitrarily large file. When this constructor + * is called, CSVStat will process the entire file iteratively. Once finished, + * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. 
+ */ CSV_INLINE CSVStat::CSVStat(csv::string_view filename, CSVFormat format) : - CSVReader(filename, format) { - /** Lazily calculate statistics for a potentially large file. Once this constructor - * is called, CSVStat will process the entire file iteratively. Once finished, - * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. - */ - while (!this->eof()) { - this->read_csv(internals::ITERATION_CHUNK_SIZE); - this->calc(); - } - - if (!this->records.empty()) - this->calc(); + reader(filename, format) { + this->calc(); } - CSV_INLINE void CSVStat::end_feed() { - CSVReader::end_feed(); + /** Calculate statistics for a CSV stored in a std::stringstream */ + CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) : + reader(stream, format) { this->calc(); } /** Return current means */ CSV_INLINE std::vector CSVStat::get_mean() const { - std::vector ret; - for (size_t i = 0; i < this->col_names->size(); i++) { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { ret.push_back(this->rolling_means[i]); } return ret; @@ -7503,8 +8269,8 @@ namespace csv { /** Return current variances */ CSV_INLINE std::vector CSVStat::get_variance() const { - std::vector ret; - for (size_t i = 0; i < this->col_names->size(); i++) { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); } return ret; @@ -7512,8 +8278,8 @@ namespace csv { /** Return current mins */ CSV_INLINE std::vector CSVStat::get_mins() const { - std::vector ret; - for (size_t i = 0; i < this->col_names->size(); i++) { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { ret.push_back(this->mins[i]); } return ret; @@ -7521,8 +8287,8 @@ namespace csv { /** Return current maxes */ CSV_INLINE std::vector CSVStat::get_maxes() const { - std::vector ret; - for (size_t i = 0; i < this->col_names->size(); i++) { + std::vector ret; + for (size_t 
i = 0; i < this->get_col_names().size(); i++) { ret.push_back(this->maxes[i]); } return ret; @@ -7531,7 +8297,7 @@ namespace csv { /** Get counts for each column */ CSV_INLINE std::vector CSVStat::get_counts() const { std::vector ret; - for (size_t i = 0; i < this->col_names->size(); i++) { + for (size_t i = 0; i < this->get_col_names().size(); i++) { ret.push_back(this->counts[i]); } return ret; @@ -7539,48 +8305,67 @@ namespace csv { /** Get data type counts for each column */ CSV_INLINE std::vector CSVStat::get_dtypes() const { - std::vector ret; - for (size_t i = 0; i < this->col_names->size(); i++) { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { ret.push_back(this->dtypes[i]); } return ret; } - CSV_INLINE void CSVStat::calc() { - /** Go through all records and calculate specified statistics */ - for (size_t i = 0; i < this->col_names->size(); i++) { - dtypes.push_back({}); - counts.push_back({}); - rolling_means.push_back(0); - rolling_vars.push_back(0); - mins.push_back(NAN); - maxes.push_back(NAN); - n.push_back(0); + CSV_INLINE void CSVStat::calc_chunk() { + /** Only create stats counters the first time **/ + if (dtypes.empty()) { + /** Go through all records and calculate specified statistics */ + for (size_t i = 0; i < this->get_col_names().size(); i++) { + dtypes.push_back({}); + counts.push_back({}); + rolling_means.push_back(0); + rolling_vars.push_back(0); + mins.push_back(NAN); + maxes.push_back(NAN); + n.push_back(0); + } } - std::vector pool; - // Start threads - for (size_t i = 0; i < this->col_names->size(); i++) + std::vector pool; + for (size_t i = 0; i < this->get_col_names().size(); i++) pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); // Block until done - for (auto& th: pool) + for (auto& th : pool) th.join(); this->records.clear(); } + CSV_INLINE void CSVStat::calc() { + constexpr size_t CALC_CHUNK_SIZE = 5000; + + for (auto& row : reader) { + this->records.push_back(std::move(row)); + + /** 
Chunk rows */ + if (this->records.size() == CALC_CHUNK_SIZE) { + calc_chunk(); + } + } + + if (!this->records.empty()) { + calc_chunk(); + } + } + CSV_INLINE void CSVStat::calc_worker(const size_t &i) { /** Worker thread for CSVStat::calc() which calculates statistics for one column. - * + * * @param[in] i Column index */ auto current_record = this->records.begin(); for (size_t processed = 0; current_record != this->records.end(); processed++) { - if (current_record->size() == this->n_cols) { + if (current_record->size() == this->get_col_names().size()) { auto current_field = (*current_record)[i]; // Optimization: Don't count() if there's too many distinct values in the first 1000 rows @@ -7598,7 +8383,7 @@ namespace csv { this->min_max(x_n, i); } } - else if (this->_format.get_variable_column_policy() == VariableColumnPolicy::THROW) { + else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) { throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record)); } @@ -7611,7 +8396,7 @@ namespace csv { * @param[in] record Data observation * @param[out] i The column index that should be updated */ - + auto type = data.type(); if (this->dtypes[i].find(type) != this->dtypes[i].end()) { @@ -7650,7 +8435,7 @@ namespace csv { this->mins[i] = x_n; if (std::isnan(this->maxes[i])) this->maxes[i] = x_n; - + if (x_n < this->mins[i]) this->mins[i] = x_n; else if (x_n > this->maxes[i]) @@ -7670,7 +8455,7 @@ namespace csv { long double delta2; current_n++; - + if (current_n == 1) { current_rolling_mean = x_n; } else { @@ -7717,20 +8502,32 @@ namespace csv { return csv_dtypes; } } +#include #include namespace csv { - /** Shorthand function for parsing an in-memory CSV string, - * a collection of CSVRow objects + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects * + * @par Example * @snippet tests/test_read_csv.cpp Parse Example */ CSV_INLINE 
CSVReader parse(csv::string_view in, CSVFormat format) { - CSVReader parser(format); - parser.feed(in); - parser.end_feed(); - return parser; + std::stringstream stream(in.data()); + return CSVReader(stream, format); + } + + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); + + return parse(in, format); } /** Parse a RFC 4180 CSV string, returning a collection @@ -7744,6 +8541,11 @@ namespace csv { return parse(csv::string_view(in, n)); } + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return parse_no_header(csv::string_view(in, n)); + } + /** * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise * @@ -7752,9 +8554,9 @@ namespace csv { * @param[in] format Format of the CSV file */ CSV_INLINE int get_col_pos( - const std::string filename, - const std::string col_name, - const CSVFormat format) { + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { CSVReader reader(filename, format); return reader.index_of(col_name); } @@ -7771,172 +8573,13 @@ namespace csv { filename, reader.get_col_names(), format.get_delim(), - reader.size(), - (int)reader.get_col_names().size() + reader.n_rows(), + reader.get_col_names().size() }; return info; } } -namespace csv { - CSV_INLINE void BasicCSVParser::parse(csv::string_view in, std::deque& records) { - using internals::ParseFlags; - - this->set_data_ptr(std::make_shared()); - this->data_ptr->col_names = this->col_names; - this->_records = &records; - - // Check for previous fragments - if ((this->current_row.data && this->current_row.size() > 0) || this->field_length > 0) { - // Make a separate data buffer for the fragment row - auto temp_str = this->current_row.data->data.substr(this->current_row.data_start); - - this->current_row.data = this->data_ptr; - 
this->current_row.data_start = 0; - this->current_row.row_length = 0; - this->current_row.field_bounds_index = 0; - - this->field_start = -1; - this->field_length = 0; - - auto& fragment_data = this->current_row.data; - fragment_data->data.reserve(temp_str.size() + in.size()); - fragment_data->data = temp_str; - fragment_data->data += in; - - in = csv::string_view(fragment_data->data); - } - else { - this->data_ptr->data.assign(in.data(), in.size()); - this->current_row = CSVRow(this->data_ptr); - } - - this->parse_loop(in); - } - - CSV_INLINE void BasicCSVParser::push_field() - { - // Push field - this->fields->push_back({ - this->field_start > 0 ? (unsigned int)this->field_start : 0, - this->field_length - }); - this->current_row.row_length++; - - if (this->field_has_double_quote) { - this->current_row.data->has_double_quotes.insert(this->data_ptr->fields.size() - 1); - this->field_has_double_quote = false; - } - - // Reset field state - this->field_start = -1; - this->field_length = 0; - } - - CONSTEXPR void BasicCSVParser::parse_field(csv::string_view in, size_t& i, const size_t& current_row_start, bool quote_escape) { - using internals::ParseFlags; - - // Trim off leading whitespace - while (i < in.size() && ws_flag(in[i])) i++; - - if (this->field_start < 0) { - this->field_start = (int)(i - current_row_start); - } - - // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous - // sequences, use the loop below to avoid having to go through the outer - // switch statement as much as possible - if (quote_escape) { - while (i < in.size() && parse_flag(in[i]) != ParseFlags::QUOTE) i++; - } - else { - while (i < in.size() && parse_flag(in[i]) == ParseFlags::NOT_SPECIAL) i++; - } - - this->field_length = i - (this->field_start + current_row_start); - - // Trim off trailing whitespace, this->field_length constraint matters - // when field is entirely whitespace - for (size_t j = i - 1; ws_flag(in[j]) && this->field_length > 0; j--) 
this->field_length--; - } - - CSV_INLINE void BasicCSVParser::parse_loop(csv::string_view in) - { - using internals::ParseFlags; - - // Parser state - size_t current_row_start = 0; - bool quote_escape = false; - - size_t in_size = in.size(); - for (size_t i = 0; i < in_size; ) { - if (quote_escape) { - // TODO: Clean up these conditions - if (parse_flag(in[i]) == ParseFlags::QUOTE) { - if (i + 1 == in.size() || (i + 1 < in.size() && parse_flag(in[i + 1]) >= ParseFlags::DELIMITER)) { - quote_escape = false; - i++; - continue; - } - - // Case: Escaped quote - this->field_length++; - i++; - - if (i < in.size() && parse_flag(in[i]) == ParseFlags::QUOTE) { - i++; - this->field_length++; - this->field_has_double_quote = true; - } - - continue; - } - - this->parse_field(in, i, current_row_start, quote_escape); - } - else { - switch (parse_flag(in[i])) { - case ParseFlags::DELIMITER: - this->push_field(); - i++; - break; - - case ParseFlags::NEWLINE: - i++; - - // Catches CRLF (or LFLF) - if (i < in.size() && parse_flag(in[i]) == ParseFlags::NEWLINE) i++; - - // End of record -> Write record - this->push_field(); - this->push_row(*this->_records); - this->current_row = CSVRow(this->data_ptr); - this->current_row.data_start = i; - this->current_row.field_bounds_index = this->data_ptr->fields.size(); - current_row_start = i; - break; - - case ParseFlags::NOT_SPECIAL: - this->parse_field(in, i, current_row_start, quote_escape); - break; - default: // Quote - if (this->field_length == 0) { - quote_escape = true; - i++; - break; - } - - // Unescaped quote - this->field_length++; - i++; - - break; - } - } - } - } -} - #endif diff --git a/include/indicators.hpp b/external/indicators.hpp similarity index 99% rename from include/indicators.hpp rename to external/indicators.hpp index 3832d665..857e25b7 100644 --- a/include/indicators.hpp +++ b/external/indicators.hpp @@ -1666,7 +1666,7 @@ static inline std::wstring utf8_decode(const std::string& s) { setlocale(LC_ALL, 
curLocale.c_str()); return result; } -#else +#else static inline std::wstring utf8_decode(const std::string& s) { auto r = setlocale(LC_ALL, ""); std::string curLocale; diff --git a/include/cartogram_info.hpp b/include/cartogram_info.hpp index 6fe66bd6..ccb9389c 100644 --- a/include/cartogram_info.hpp +++ b/include/cartogram_info.hpp @@ -39,7 +39,8 @@ class CartogramInfo void replace_missing_and_zero_target_areas(); std::string set_map_name(const std::string &); void shift_insets_to_target_position(); - void write_geojson(const std::string &, const std::string &, bool); + void write_csv(const std::string &csv_file_name); + void write_geojson(const std::string &, const std::string &, bool = false); }; #endif // CARTOGRAM_INFO_HPP_ diff --git a/include/parse_arguments.hpp b/include/parse_arguments.hpp index 305d7976..932cc1b6 100644 --- a/include/parse_arguments.hpp +++ b/include/parse_arguments.hpp @@ -25,6 +25,7 @@ argparse::ArgumentParser parsed_arguments( bool &remove_tiny_polygons, double &minimum_polygon_area, bool &plot_quadtree, - bool &rays); + bool &rays, + bool &output_preprocessed); #endif // PARSE_ARGUMENTS_HPP_ diff --git a/include/progress_tracker.hpp b/include/progress_tracker.hpp index 4de72f62..40cd4732 100644 --- a/include/progress_tracker.hpp +++ b/include/progress_tracker.hpp @@ -26,6 +26,7 @@ class ProgressTracker private: double total_geo_divs_; // Total number of GeoDivs to monitor progress double progress_; // Progress measured on a scale from 0 (start) to 1 (end) + double max_progress_; // Maximum progress value ever reached indicators::ProgressBar bar_; }; diff --git a/src/cartogram_info/cartogram_info.cpp b/src/cartogram_info/cartogram_info.cpp index 48ffa5a8..5d9630d3 100644 --- a/src/cartogram_info/cartogram_info.cpp +++ b/src/cartogram_info/cartogram_info.cpp @@ -1,5 +1,6 @@ #include "cartogram_info.hpp" #include "constants.hpp" +#include "csv.hpp" #include #include @@ -204,3 +205,36 @@ std::string CartogramInfo::set_map_name(const 
std::string &map_name) } return map_name_; } + +void CartogramInfo::write_csv(const std::string &csv_file_name) { + // Write a csv file with the current target areas + std::ofstream out_file_csv; + out_file_csv.open(csv_file_name + ".csv"); + if (!out_file_csv) { + std::cerr + << "ERROR writing CSV: failed to open " << csv_file_name << ".csv" + << std::endl; + } + + // Each vector of strings will represent one row, starting with column names + std::vector > csv_rows(1); + + csv_rows[0].push_back(id_header_); + csv_rows[0].push_back("Target Area"); + + // Fill up the rows with the IDs and target areas + for (const auto &[id, inset_pos] : gd_to_inset_) { + const auto &inset_state = inset_states_.at(inset_pos); + const auto target_area = inset_state.target_area_at(id); + csv_rows.push_back({id, std::to_string(target_area)}); + } + + // Write to CSV object + auto writer = csv::make_csv_writer(out_file_csv); + for (const auto &row : csv_rows) { + writer << row; + } + + // Close out_file and exit + out_file_csv.close(); +} \ No newline at end of file diff --git a/src/cartogram_info/write_geojson.cpp b/src/cartogram_info/write_geojson.cpp index 7a3f17f2..a0174239 100644 --- a/src/cartogram_info/write_geojson.cpp +++ b/src/cartogram_info/write_geojson.cpp @@ -177,7 +177,7 @@ void CartogramInfo::write_geojson( const std::string &new_geo_file_name, const bool output_to_stdout) { - std::cerr << "Writing " << new_geo_file_name << std::endl; + std::cerr << "Writing " << new_geo_file_name << ".geojson" << std::endl; std::ifstream old_file(old_geo_file_name); nlohmann::json old_json; old_file >> old_json; @@ -193,7 +193,7 @@ void CartogramInfo::write_geojson( combined_json["Original"] = new_json_original; std::cout << combined_json << std::endl; } else { - std::ofstream o(new_geo_file_name); + std::ofstream o(new_geo_file_name + ".geojson"); o << new_json << std::endl; } } diff --git a/src/inset_state/inset_state.cpp b/src/inset_state/inset_state.cpp index cd58a931..c12dbc32 
100644 --- a/src/inset_state/inset_state.cpp +++ b/src/inset_state/inset_state.cpp @@ -182,7 +182,7 @@ bool InsetState::insert_constraint_safely(const Point &p1, const Point &p2) proj_qd_.dt.insert_constraint(p1, p2); return true; } catch (const std::exception &e) { - std::cout << "WARNING DIAGONAL: Could not insert constraint between " << p1 + std::cerr << "WARNING DIAGONAL: Could not insert constraint between " << p1 << " and " << p2 << std::endl; std::cerr << e.what() << std::endl; // Add to the list of failed constraints @@ -419,12 +419,12 @@ void InsetState::create_and_store_quadtree_cell_corners() double rho_max = -1e9; // get the minimum rho_init of the bbox of the node - for (unsigned int i = bbox.xmin(); i < bbox.xmax(); ++i) { - for (unsigned int j = bbox.ymin(); j < bbox.ymax(); ++j) { - if (i >= this->lx() || j >= this->ly()) { + for (int i = bbox.xmin(); i < bbox.xmax(); ++i) { + for (int j = bbox.ymin(); j < bbox.ymax(); ++j) { + if (i < 0 || j < 0) { continue; } - if (i < 0 || j < 0) { + if (i >= (int) this->lx() || j >= (int) this->ly()) { continue; } rho_min = std::min(rho_min, this->ref_to_rho_init()(i, j)); diff --git a/src/main.cpp b/src/main.cpp index eba784ed..ca32acff 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -32,7 +32,8 @@ int main(const int argc, const char *argv[]) // Other boolean values that are needed to parse the command line arguments bool make_csv, output_equal_area, output_to_stdout, plot_density, plot_grid, - plot_intersections, plot_polygons, plot_quadtree, remove_tiny_polygons; + plot_intersections, plot_polygons, plot_quadtree, remove_tiny_polygons, + output_preprocessed; // If the proportion of the polygon area is smaller than // min_polygon_area * total area, then remove polygon @@ -61,7 +62,8 @@ int main(const int argc, const char *argv[]) remove_tiny_polygons, min_polygon_area, plot_quadtree, - rays); + rays, + output_preprocessed); // Initialize cart_info. 
It contains all the information about the cartogram // that needs to be handled by functions called from main(). @@ -121,21 +123,6 @@ int main(const int argc, const char *argv[]) return EXIT_FAILURE; } - if (simplify) { - std::cerr << "Start of initial simplification of " << inset_pos - << std::endl; - time_tracker.start("Simplification"); - - // Simplification reduces the number of points used to represent the - // GeoDivs in the inset, thereby reducing output file sizes and - // run-times - inset_state.simplify(target_points_per_inset); - - // Update time - time_tracker.stop("Simplification"); - } - std::cerr << "End of initial simplification of " << inset_pos << std::endl; - // End of inset time time_tracker.stop("Inset " + inset_pos); } @@ -157,10 +144,7 @@ int main(const int argc, const char *argv[]) cart_info.shift_insets_to_target_position(); // Output to GeoJSON - cart_info.write_geojson( - geo_file_name, - map_name + "_equal_area.geojson", - output_to_stdout); + cart_info.write_geojson(geo_file_name, map_name + "_equal_area"); return EXIT_SUCCESS; } @@ -185,18 +169,39 @@ int main(const int argc, const char *argv[]) // Rescale map to fit into a rectangular box [0, lx] * [0, ly] inset_state.rescale_map(long_grid_side_length, cart_info.is_world_map()); - // Output rescaled GeoJSON - cart_info.write_geojson( - geo_file_name, - map_name + "_input.geojson", - output_to_stdout); - if (output_to_stdout) { // Store original coordinates inset_state.store_original_geo_divs(); } + if (simplify) { + std::cerr << "Start of initial simplification of " << inset_pos + << std::endl; + time_tracker.start("Simplification"); + + // Simplification reduces the number of points used to represent the + // GeoDivs in the inset, thereby reducing output file sizes and + // run-times + inset_state.simplify(target_points_per_inset); + + // Update time + time_tracker.stop("Simplification"); + } + std::cerr << "End of initial simplification of " << inset_pos << std::endl; + + if 
(output_preprocessed) { + // Output rescaled GeoJSON + cart_info.write_geojson( + geo_file_name, + // processed = simplified + rescaled + // and potentially projected + small polygons removed + map_name + "_input_processed"); + + // Output preprocessed CSV file + cart_info.write_csv(map_name + "_input_processed"); + } + // Set up Fourier transforms const unsigned int lx = inset_state.lx(); const unsigned int ly = inset_state.ly(); @@ -383,14 +388,6 @@ int main(const int argc, const char *argv[]) plot_grid); } - if (world) { - cart_info.write_geojson( - geo_file_name, - map_name + "_cartogram_in_smyth_projection.geojson", - output_to_stdout); - inset_state.revert_smyth_craster_projection(); - } - if (output_to_stdout and !qtdt_method) { inset_state.fill_grid_diagonals(true); inset_state.project_with_cum_proj(); @@ -422,7 +419,7 @@ int main(const int argc, const char *argv[]) // Output to GeoJSON cart_info.write_geojson( geo_file_name, - map_name + "_cartogram.geojson", + map_name + "_cartogram", output_to_stdout); // Stop of main function time diff --git a/src/misc/parse_arguments.cpp b/src/misc/parse_arguments.cpp index c5854a7f..1da6ef8f 100644 --- a/src/misc/parse_arguments.cpp +++ b/src/misc/parse_arguments.cpp @@ -22,11 +22,12 @@ argparse::ArgumentParser parsed_arguments( bool &remove_tiny_polygons, double &minimum_polygon_area, bool &plot_quadtree, - bool &rays) + bool &rays, + bool &output_preprocessed) { // Create parser for arguments using argparse. 
// From https://github.com/p-ranav/argparse - argparse::ArgumentParser arguments("./cartogram", "2.0"); + argparse::ArgumentParser arguments("./cartogram", RELEASE_TAG); // Positional argument accepting geometry file (GeoJSON, JSON) as input arguments.add_argument("geometry_file") @@ -115,6 +116,10 @@ argparse::ArgumentParser parsed_arguments( .help("Boolean: Use old ray shooting method to fill density") .default_value(false) .implicit_value(true); + arguments.add_argument("--output_preprocessed") + .help("Boolean: output input GeoJSON and CSV after preprocessing") + .default_value(false) + .implicit_value(true); // Arguments of column names in provided visual variables file (CSV) std::string pre = "String: Column name for "; @@ -174,6 +179,7 @@ argparse::ArgumentParser parsed_arguments( make_csv = arguments.get("-M"); output_equal_area = arguments.get("-E"); output_to_stdout = arguments.get("-O"); + output_preprocessed = arguments.get("--output_preprocessed"); plot_density = arguments.get("-d"); plot_grid = arguments.get("-g"); plot_intersections = arguments.get("-i"); diff --git a/src/misc/progress_tracker.cpp b/src/misc/progress_tracker.cpp index 70052186..c574d3e7 100644 --- a/src/misc/progress_tracker.cpp +++ b/src/misc/progress_tracker.cpp @@ -40,6 +40,25 @@ void ProgressTracker::print_progress_mid_integration( // finished insets const double inset_max_frac = inset_state.n_geo_divs() / total_geo_divs_; double progress = progress_ + (inset_max_frac / n_predicted_integrations); + + // Change how much progress increases by, so it never reaches 100 here + double remaining_progress = 1.0 - max_progress_; + double dynamic_increment = remaining_progress * 0.1; + + // Leave buffer at end so that we don't reach 100% prematurely + progress = std::min(progress, 0.75); + + // Our assumption above causes the progress bar to start at 36%. + // Thus, we temper it down for the first few integrations. 
+ if (inset_state.n_finished_integrations() < 4) { + progress = std::min(progress, max_progress_); + } + + // Increase max_progress by dynamic increment that gets smaller + // as we get closer to 100%. + progress = std::max(progress, max_progress_ + dynamic_increment); + + max_progress_ = progress; print_progress(progress); print_progress_bar(progress); } @@ -49,6 +68,7 @@ void ProgressTracker::print_progress_mid_integration( void ProgressTracker::update_and_print_progress_end_integration( const InsetState &inset_state) { + max_progress_ = 0; const double inset_max_frac = inset_state.n_geo_divs() / total_geo_divs_; progress_ += inset_max_frac; print_progress(progress_);