From 6ef78112a73517de55eeae08303e55f1669564ff Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Tue, 3 Dec 2024 14:54:46 +0000
Subject: [PATCH] Setting max_size to a hard_limit if failed once (#297)

* Setting max_size to a hard_limit if failed once

* Add record of error

* Commented out unneeded lines

* Edits to max_size

---------

Co-authored-by: alexiglaser
---
 digital_land/package/datasetparquet.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/digital_land/package/datasetparquet.py b/digital_land/package/datasetparquet.py
index 9c8a8a64..724e92da 100644
--- a/digital_land/package/datasetparquet.py
+++ b/digital_land/package/datasetparquet.py
@@ -2,6 +2,7 @@
 import logging
 import duckdb
 from .package import Package
+import resource
 
 logger = logging.getLogger(__name__)
 
@@ -64,15 +65,15 @@ def create_temp_table(self, input_paths):
 
         input_paths_str = ", ".join([f"'{path}'" for path in input_paths])
 
-        self.conn.execute("DROP TABLE IF EXISTS temp_table")
         # Initial max_line_size and increment step
         max_size = 40000000
-        increment_step = 20000000
+        # increment_step = 20000000
         # max_limit = 200000000 # Maximum allowable line size to attempt
-        increment = False
+        # increment = False
 
         while True:
             try:
+                self.conn.execute("DROP TABLE IF EXISTS temp_table")
                 query = f"""
                     CREATE TEMPORARY TABLE temp_table AS
                     SELECT *
@@ -85,14 +86,19 @@ def create_temp_table(self, input_paths):
                     )
                 """
                 self.conn.execute(query)
-                if increment:
-                    logging.info(f"Ended up needing a value of max_size = {max_size}")
                 break
             except duckdb.Error as e:  # Catch specific DuckDB error
                 if "Value with unterminated quote" in str(e):
-                    increment = True
-                    max_size += increment_step
+                    hard_limit = int(resource.getrlimit(resource.RLIMIT_AS)[1])
+                    if max_size < hard_limit / 3:
+                        logging.info(
+                            f"Initial max_size did not work, setting it to {hard_limit / 2}"
+                        )
+                        max_size = hard_limit / 2
+                    else:
+                        raise
                 else:
+                    logging.info(f"Failed to read in when max_size = {max_size}")
                     raise
 
     def load_facts(self):