From f43907deb84142e20a1afdeef1dc307e5724eba1 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:33:53 +0000 Subject: [PATCH] ensure where clause isn't used if no range is supplied --- digital_land/package/dataset_parquet.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/digital_land/package/dataset_parquet.py b/digital_land/package/dataset_parquet.py index e9c5c489..73bef18c 100644 --- a/digital_land/package/dataset_parquet.py +++ b/digital_land/package/dataset_parquet.py @@ -224,11 +224,17 @@ def load_entities_range( # Do this to match with later field names. entity_fields = [e.replace("-", "_") for e in entity_fields] # input_paths_str = f"{self.cache_dir}/fact{self.suffix}" + if entity_range is not None: + entity_where_clause = ( + f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" + ) + else: + entity_where_clause = "" query = f""" SELECT DISTINCT REPLACE(field,'-','_') FROM parquet_scan('{transformed_parquet_dir}/*.parquet') - WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]} + {entity_where_clause} """ # distinct_fields - list of fields in the field in fact @@ -299,12 +305,7 @@ def load_entities_range( # query to create the file # craft a where clause to limit entities in quetion, this chunking helps solve memory issues - if entity_range is not None: - entity_where_clause = ( - f"WHERE entity >= {entity_range[0]} AND entity < {entity_range[1]}" - ) - else: - entity_where_clause = "" + query = f""" SELECT {fields_str}{optional_org_str} FROM ( SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date