From 5c9e11069121279f28d704f84a25fe1629f57d28 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 10 May 2024 22:52:41 +0800 Subject: [PATCH 01/33] WIP: Add geometry logical type --- src/main/thrift/parquet.thrift | 98 +++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 83457fe29..0cc7745a3 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -277,8 +277,11 @@ struct Statistics { * may set min_value="B", max_value="C". Such more compact values must still be * valid values within the column's logical type. * - * Values are encoded using PLAIN encoding, except that variable-length byte - * arrays do not include a length prefix. + * Values are encoded using PLAIN encoding, except that: + * 1) variable-length byte arrays do not include a length prefix. + * 2) geometry logical type with BoundingBoxOrder uses max_value/min_value pair + * to store the bounding box for the column. Please refer to the definition + * of BoundingBoxOrder for detail. */ 5: optional binary max_value; 6: optional binary min_value; @@ -380,6 +383,69 @@ struct JsonType { struct BsonType { } +/** + * A geometry can be any of the following subtypes. + * The list of geospatial subtypes is taken from the OGC (Open Geospatial Consortium) + * SFA (Simple Feature Access) Part 1- Common Architecture. + */ +enum GeometrySubType { + POINT = 0; + LINESTRING = 1; + POLYGON = 2; + MULTIPOINT = 3; + MULTILINESTRING = 4; + MULTIPOLYGON = 5; + GEOMETRY_COLLECTION = 6; +} + +/** + * Interpretation for edges, i.e. whether the edge between points + * represent a straight cartesian line or the shortest line on the sphere + */ +enum Edges { + PLANAR = 0; + // SPHERICAL = 1; // not supported yet +} + +/** + * Well-Known Binary. This is a well-known and popular binary representation regulated + * by the Open Geospatial Consortium (OGC). 
+ */ +struct WKB {} +/** + * Encoding for geospatial data. + */ +union GeospatialEncoding { + 1: WKB WKB +} + +/** + * Geometry logical type annotation + * + * Allowed for physical types: BINARY (added in 2.11.0) + */ +struct GeometryType { + /** + * The subtype of the geometry. + * If set, all values in the column must be of the same subtype. + * If not set, the column may contain values of any subtype. + */ + 1: optional GeometrySubType subtype; + /** + * The dimension of the geometry. + * For now only 2D geometry is supported and the value must be 2 if set. + */ + 2: optional byte dimension; + /** + * Coordinate Reference System, i.e. mapping of how coordinates refer to + * precise locations on earth. + * For now only OGC:CRS84 is supported. + */ + 3: optional string crs; + 4: required Edges edges; + 5: required GeospatialEncoding encoding; +} + /** * LogicalType annotations to replace ConvertedType. * @@ -410,6 +476,7 @@ union LogicalType { 13: BsonType BSON // use ConvertedType BSON 14: UUIDType UUID // no compatible ConvertedType 15: Float16Type FLOAT16 // no compatible ConvertedType + 16: GeometryType GEOMETRY // no compatible ConvertedType } /** @@ -942,6 +1009,8 @@ struct RowGroup { /** Empty struct to signal the order defined by the physical or logical type */ struct TypeDefinedOrder {} +/** Empty struct to signal the order of GEOMETRY logical type */ +struct BoundingBoxOrder {} /** * Union to specify the order used for the min_value and max_value fields for a @@ -951,6 +1020,8 @@ struct TypeDefinedOrder {} * Possible values are: * * TypeDefinedOrder - the column uses the order defined by its logical or * physical type (if there is no logical type). + * * BoundingBoxOrder - the column uses the order to build bounding box + * (if the logical type is GEOMETRY). * * If the reader does not support the value of this union, min and max stats * for this column should be ignored. 
@@ -980,6 +1051,7 @@ union ColumnOrder { * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined + * GEOMETRY - undefined, as geometry objects cannot be compared directly * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true @@ -1008,6 +1080,23 @@ union ColumnOrder { * `-0.0` should be written into the min statistics field. */ 1: TypeDefinedOrder TYPE_ORDER; + + /** + * The order only applies to GEOMETRY logical type. + * + * Please note that geometry objects cannot be compared directly. This order aims to + * provide an approach to build a bounding box for geometry objects in the same page + * or column chunk. + * + * In this order, all 2D geometries are regarded as a collection of coordinate (x, y). + * For example, POINT has one coordinate, LINESTRING has two coordinates, and POLYGON + * might have three or more coordinates. A bounding box is the combination of x_min, + * x_max, y_min, and y_max of all coordinates from all geometry values. For simplexty, + * min_value field in the Statistics/ColumnIndex is encoded as the concatenation of + * PLAIN-encoded DOUBLE-typed x_min and y_min values. Similarly, max_value field is + * encoded as the concatenation of PLAIN-encoded DOUBLE-typed x_max and y_max values. + */ + 2: BoundingBoxOrder BBOX_ORDER; } struct PageLocation { @@ -1079,6 +1168,9 @@ struct ColumnIndex { * Such more compact values must still be valid values within the column's * logical type. Readers must make sure that list entries are populated before * using them by inspecting null_pages. + * + * For GEOMETRY logical type, these values are the bounding box of the column. + * Please refer to the definition of BoundingBoxOrder for detail. */ 2: required list min_values 3: required list max_values @@ -1088,6 +1180,8 @@ struct ColumnIndex { * which direction. This allows readers to perform binary searches in both * lists. 
Readers cannot assume that max_values[i] <= min_values[i+1], even * if the lists are ordered. + * + * For GEOMETRY type, UNORDERED is used at all times. */ 4: required BoundaryOrder boundary_order From 5ef28cd5de12e6d6de453e7c6855da62c63a21b8 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 25 May 2024 23:14:15 +0800 Subject: [PATCH 02/33] address various comments --- src/main/thrift/parquet.thrift | 174 ++++++++++++++++++--------------- 1 file changed, 93 insertions(+), 81 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 0cc7745a3..1ca2f5952 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -237,6 +237,38 @@ struct SizeStatistics { 3: optional list definition_level_histogram; } +/** + * Bounding box of geometries in the representation of min/max value pair of + * coordinates from each axis. Values of Z and M are omitted for 2D geometries. + */ +struct BoundingBox { + 1: optional double x_min; + 2: optional double x_max; + 3: optional double y_min; + 4: optional double y_max; + 5: optional double z_min; + 6: optional double z_max; + 7: optional double m_min; + 8: optional double m_max; +} + +/** Statistics specific to GEOMETRY logical type */ +struct GeometryStatistics { + /** Bounding box of geometries */ + 1: optional BoundingBox bbox; + /** Covering of geometries as a list of Google S2 cell ids */ + 2: list s2_cell_ids; + /** Covering of geometries as a list of Uber H3 indices */ + 3: list h3_indices; + /** + * The geometry types of all geometries, or an empty array if they are not + * known. It follows the same rule of `geometry_types` column metadata of + * GeoParquet. Accepted geometry types are: "Point", "LineString", "Polygon", + * "MultiPoint", "MultiLineString", "MultiPolygon", "GeometryCollection". + */ + 4: list geometry_types; +} + /** * Statistics per row group and per page * All fields are optional. 
@@ -277,11 +309,8 @@ struct Statistics { * may set min_value="B", max_value="C". Such more compact values must still be * valid values within the column's logical type. * - * Values are encoded using PLAIN encoding, except that: - * 1) variable-length byte arrays do not include a length prefix. - * 2) geometry logical type with BoundingBoxOrder uses max_value/min_value pair - * to store the bounding box for the column. Please refer to the definition - * of BoundingBoxOrder for detail. + * Values are encoded using PLAIN encoding, except that variable-length byte + * arrays do not include a length prefix. */ 5: optional binary max_value; 6: optional binary min_value; @@ -289,6 +318,9 @@ struct Statistics { 7: optional bool is_max_value_exact; /** If true, min_value is the actual minimum value for a column */ 8: optional bool is_min_value_exact; + + /** statistics specific to geometry logical type */ + 9: optional GeometryStatistics geometry_stats; } /** Empty structs to use as logical type annotations */ @@ -384,66 +416,69 @@ struct BsonType { } /** - * A geometry can be any of the following subtypes. - * The list of geospatial subtypes is taken from the OGC (Open Geospatial Consortium) - * SFA (Simple Feature Access) Part 1- Common Architecture. + * Phyiscal type and encoding for the geometry type. */ -enum GeometrySubType { - POINT = 0; - LINESTRING = 1; - POLYGON = 2; - MULTIPOINT = 3; - MULTILINESTRING = 4; - MULTIPOLYGON = 5; - GEOMETRY_COLLECTION = 6; -} - -/** - * Interpretation for edges, i.e. whether the edge between points - * represent a straight cartesian line or the shortest line on the sphere - */ -enum Edges { - PLANAR = 0; - // SPHERICAL = 1; // not supported yet -} +enum GeometryEncoding { + /** + * Allowed for phyiscal type: BYTE_ARRAY. + * + * Well-known binary (WKB) representations of geometries. 
It supports 2D or + * 3D geometries of the standard geometry types (Point, LineString, Polygon, + * MultiPoint, MultiLineString, MultiPolygon, and GeometryCollection). This + * is the preferred option for maximum portability. + * + * This encoding enables GeometryStatistics to be set in the column chunk + * and page index. + */ + WKB = 0; -/** - * Well-Known Binary. This is a well-known and popular binary representation regulated - * by the Open Geospatial Consortium (OGC). - */ -struct WKB {} -/** - * Encoding for geospatial data. - */ -union GeospatialEncoding { - 1: WKB WKB + /** + * Encodings from POINT to MULTIPOLYGON below are specialized for single + * geometry type and inspired by GeoArrow (https://geoarrow.org/format.html) + * native encodings. It uses the separated (struct) representation of + * coordinates for single-geometry type encodings because this encoding + * results in useful column statistics when row groups and/or files contain + * related features. + * + * WARNING: GeometryStatistics cannot be enabled for these encodings because + * only leaf columns can have column statistics and page index. + * + * The actual coordinates of the geometries MUST be stored as native numbers, + * i.e. using the DOUBLE type in a (repeated) group of fields (exact + * repetition depending on the geometry type). 
+ * + * For the POINT encoding, this results in a struct of two fields for x and y + * coordinates (in case of 2D geometries): + * optional group geometry { + * required double x; + * required double y; + * } + * + * For more detail, please refer to link below: + * https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#encoding + */ + POINT = 1; + LINESTRING = 2; + POLYGON = 3; + MULTIPOINT = 4; + MULTILINESTRING = 5; + MULTIPOLYGON = 6; } /** - * Geometry logical type annotation - * - * Allowed for physical types: BINARY (added in 2.11.0) + * Geometry logical type annotation (added in 2.11.0) */ struct GeometryType { /** - * The subtype of the geometry. - * If set, all values in the column must be of the same subtype. - * If not set, the column may contain values of any subtype. + * Phyiscal type and encoding for the geometry type. Please refer to the + * definition of GeometryEncoding for more detail. */ - 1: optional GeometrySubType subtype; + 1: required GeometryEncoding encoding; /** - * The dimension of the geometry. - * For now only 2D geometry is supported and the value must be 2 if set. + * Additional informative metadata. + * It can be used by GeoParquet to offload some of the column metadata. */ - 2: optional byte dimension; - /** - * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth. - * For now only OGC:CRS84 is supported. 
- */ - 3: optional string crs; - 4: required Edges edges; - 5: required GeospatialEncoding encoding; + 2: optional string metadata; } /** @@ -1009,8 +1044,6 @@ struct RowGroup { /** Empty struct to signal the order defined by the physical or logical type */ struct TypeDefinedOrder {} -/** Empty struct to signal the order of GEOMETRY logical type */ -struct BoundingBoxOrder {} /** * Union to specify the order used for the min_value and max_value fields for a @@ -1020,8 +1053,6 @@ struct BoundingBoxOrder {} * Possible values are: * * TypeDefinedOrder - the column uses the order defined by its logical or * physical type (if there is no logical type). - * * BoundingBoxOrder - the column uses the order to build bounding box - * (if the logical type is GEOMETRY). * * If the reader does not support the value of this union, min and max stats * for this column should be ignored. @@ -1051,7 +1082,7 @@ union ColumnOrder { * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined - * GEOMETRY - undefined, as geometry objects cannot be compared directly + * GEOMETRY - undefined, use GeometryStatistics instead. * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true @@ -1080,23 +1111,6 @@ union ColumnOrder { * `-0.0` should be written into the min statistics field. */ 1: TypeDefinedOrder TYPE_ORDER; - - /** - * The order only applies to GEOMETRY logical type. - * - * Please note that geometry objects cannot be compared directly. This order aims to - * provide an approach to build a bounding box for geometry objects in the same page - * or column chunk. - * - * In this order, all 2D geometries are regarded as a collection of coordinate (x, y). - * For example, POINT has one coordinate, LINESTRING has two coordinates, and POLYGON - * might have three or more coordinates. A bounding box is the combination of x_min, - * x_max, y_min, and y_max of all coordinates from all geometry values. 
For simplexty, - * min_value field in the Statistics/ColumnIndex is encoded as the concatenation of - * PLAIN-encoded DOUBLE-typed x_min and y_min values. Similarly, max_value field is - * encoded as the concatenation of PLAIN-encoded DOUBLE-typed x_max and y_max values. - */ - 2: BoundingBoxOrder BBOX_ORDER; } struct PageLocation { @@ -1168,9 +1182,6 @@ struct ColumnIndex { * Such more compact values must still be valid values within the column's * logical type. Readers must make sure that list entries are populated before * using them by inspecting null_pages. - * - * For GEOMETRY logical type, these values are the bounding box of the column. - * Please refer to the definition of BoundingBoxOrder for detail. */ 2: required list min_values 3: required list max_values @@ -1180,8 +1191,6 @@ struct ColumnIndex { * which direction. This allows readers to perform binary searches in both * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even * if the lists are ordered. - * - * For GEOMETRY type, UNORDERED is used at all times. */ 4: required BoundaryOrder boundary_order @@ -1214,6 +1223,9 @@ struct ColumnIndex { * Same as repetition_level_histograms except for definitions levels. **/ 7: optional list definition_level_histograms; + + /** A list containing statistics of GEOMETRY logical type for each page */ + 8: optional list geometry_stats; } struct AesGcmV1 { From ecd8cc28a3d92627c3a4506911e2f17d5eb53f18 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 27 May 2024 09:32:37 +0800 Subject: [PATCH 03/33] add file level geo stats --- src/main/thrift/parquet.thrift | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 1ca2f5952..3907a62d6 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -479,6 +479,8 @@ struct GeometryType { * It can be used by GeoParquet to offload some of the column metadata. 
*/ 2: optional string metadata; + /** File-level statistics for geometries */ + 3: optional GeometryStatistics statistics; } /** From d81dacb0bde64170044cb34a865d10e9cfb34d19 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 31 May 2024 11:30:15 +0800 Subject: [PATCH 04/33] address feedback: - remove file-level geo stats - add custom wkb-encoded geometry stats - comment out controversial items --- src/main/thrift/parquet.thrift | 126 ++++++++++++++++++++++++++------- 1 file changed, 99 insertions(+), 27 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 3907a62d6..1c91910be 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -237,36 +237,97 @@ struct SizeStatistics { 3: optional list definition_level_histogram; } +/** + * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge + * between points represent a straight cartesian line or the shortest line on + * the sphere. + */ +enum Edges { + PLANAR = 0; + SPHERICAL = 1; +} + +/** + * A custom WKB-encoded geometry data to be used in geometry statistics. + * The geometry may be a polygon to encode an s2 or h3 covering to provide + * vendor-agnostic coverings, or an evelope of geometries when a bounding + * box cannot be built (e.g. a geometry has spherical edges, or if an edge + * of geographic coordinates crosses the antimeridian). + */ +struct Geometry { + /** Bytes of a WKB-encoded geometry */ + 1: required binary geometry; + /** + * Edges of the geometry if it is a polygon. It may be different to the + * edges attribute from the GEOMETRY logical type. + */ + 2: optional Edges edges; +} + /** * Bounding box of geometries in the representation of min/max value pair of * coordinates from each axis. Values of Z and M are omitted for 2D geometries. 
*/ struct BoundingBox { - 1: optional double x_min; - 2: optional double x_max; - 3: optional double y_min; - 4: optional double y_max; - 5: optional double z_min; - 6: optional double z_max; - 7: optional double m_min; - 8: optional double m_max; + 1: required double xmin; + 2: required double xmax; + 3: required double ymin; + 4: required double ymax; + 5: optional double zmin; + 6: optional double zmax; + 7: optional double mmin; + 8: optional double mmax; } -/** Statistics specific to GEOMETRY logical type */ -struct GeometryStatistics { - /** Bounding box of geometries */ - 1: optional BoundingBox bbox; +union Envelope { + 1: BoundingBox bbox // A bounding box of geometries if it can be built. + 2: Geometry covering // A covering polygon of geometries if bbox is unavailable. +} + +/** S2 spatial index: http://s2geometry.io/ */ +struct S2Index { + /** Level of S2 cell ids. valid range is [0, 30] */ + 1: required i32 level; /** Covering of geometries as a list of Google S2 cell ids */ - 2: list s2_cell_ids; + 2: required list cell_ids; +} + +/** H3 spatial index: https://h3geo.org */ +struct H3Index { + /** Precision of H3 cell ids. valid range is [0, 15] */ + 1: required i32 precision; /** Covering of geometries as a list of Uber H3 indices */ - 3: list h3_indices; + 2: required list cell_ids; +} + +/** Statistics specific to GEOMETRY logical type */ +struct GeometryStatistics { + /** Envelope of geometries */ + 1: optional Envelope envelope; + /** * The geometry types of all geometries, or an empty array if they are not * known. It follows the same rule of `geometry_types` column metadata of * GeoParquet. Accepted geometry types are: "Point", "LineString", "Polygon", * "MultiPoint", "MultiLineString", "MultiPolygon", "GeometryCollection". + * + * In addition, the following rules are used: + * - In case of 3D geometries, a `" Z"` suffix gets added (e.g. `["Point Z"]`). + * - A list of multiple values indicates that multiple geometry types are + * present (e.g. 
`["Polygon", "MultiPolygon"]`). + * - An empty array explicitly signals that the geometry types are not known. + * - The geometry types in the list must be unique (e.g. `["Point", "Point"]` + * is not valid). + * + * Please refer to link below for more detail: + * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ - 4: list geometry_types; + 2: optional list geometry_types; + + // S2 and H3 are controversial from the discussion. Now they are commented + // out to show a possible approach for future extension. + // 3: optional S2Index s2; + // 4: optional H3Index h3; } /** @@ -440,9 +501,6 @@ enum GeometryEncoding { * results in useful column statistics when row groups and/or files contain * related features. * - * WARNING: GeometryStatistics cannot be enabled for these encodings because - * only leaf columns can have column statistics and page index. - * * The actual coordinates of the geometries MUST be stored as native numbers, * i.e. using the DOUBLE type in a (repeated) group of fields (exact * repetition depending on the geometry type). @@ -456,13 +514,20 @@ enum GeometryEncoding { * * For more detail, please refer to link below: * https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#encoding + * + * WARNING: GeometryStatistics cannot be enabled for these encodings because + * only leaf columns can have column statistics and page index. In this case, + * the statistics for the leaf columns contain equivalent information to the + * bounding box. */ - POINT = 1; - LINESTRING = 2; - POLYGON = 3; - MULTIPOINT = 4; - MULTILINESTRING = 5; - MULTIPOLYGON = 6; + // Native encodings are controversial from the discussion. Now they are commented + // out to show a possible approach for future extension. 
+ // POINT = 1; + // LINESTRING = 2; + // POLYGON = 3; + // MULTIPOINT = 4; + // MULTILINESTRING = 5; + // MULTIPOLYGON = 6; } /** @@ -474,13 +539,20 @@ struct GeometryType { * definition of GeometryEncoding for more detail. */ 1: required GeometryEncoding encoding; + /** + * Coordinate Reference System, i.e. mapping of how coordinates refer to + * precise locations on earth, e.g. OGC:CRS84 + */ + 2: optional string crs; + /** + * Edges of polygon. + */ + 3: optional Edges edges; /** * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata. */ - 2: optional string metadata; - /** File-level statistics for geometries */ - 3: optional GeometryStatistics statistics; + 4: optional string metadata; } /** From 80f40513499c9bcc019e1e22a92e7066c18713e2 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 13 Jun 2024 14:54:42 +0800 Subject: [PATCH 05/33] change naming and remove controversial items --- src/main/thrift/parquet.thrift | 67 +++------------------------------- 1 file changed, 6 insertions(+), 61 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 1c91910be..ae8bd1f37 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -279,31 +279,15 @@ struct BoundingBox { 8: optional double mmax; } -union Envelope { - 1: BoundingBox bbox // A bounding box of geometries if it can be built. - 2: Geometry covering // A covering polygon of geometries if bbox is unavailable. -} - -/** S2 spatial index: http://s2geometry.io/ */ -struct S2Index { - /** Level of S2 cell ids. valid range is [0, 30] */ - 1: required i32 level; - /** Covering of geometries as a list of Google S2 cell ids */ - 2: required list cell_ids; -} - -/** H3 spatial index: https://h3geo.org */ -struct H3Index { - /** Precision of H3 cell ids. 
valid range is [0, 15] */ - 1: required i32 precision; - /** Covering of geometries as a list of Uber H3 indices */ - 2: required list cell_ids; +struct Covering { + optional BoundingBox bbox // A bounding box of geometries if it can be built. + optional Geometry covering // A covering polygon of geometries if bbox is unavailable. } /** Statistics specific to GEOMETRY logical type */ struct GeometryStatistics { - /** Envelope of geometries */ - 1: optional Envelope envelope; + /** Covering of geometries */ + 1: optional Covering covering; /** * The geometry types of all geometries, or an empty array if they are not @@ -323,11 +307,6 @@ struct GeometryStatistics { * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ 2: optional list geometry_types; - - // S2 and H3 are controversial from the discussion. Now they are commented - // out to show a possible approach for future extension. - // 3: optional S2Index s2; - // 4: optional H3Index h3; } /** @@ -493,41 +472,7 @@ enum GeometryEncoding { */ WKB = 0; - /** - * Encodings from POINT to MULTIPOLYGON below are specialized for single - * geometry type and inspired by GeoArrow (https://geoarrow.org/format.html) - * native encodings. It uses the separated (struct) representation of - * coordinates for single-geometry type encodings because this encoding - * results in useful column statistics when row groups and/or files contain - * related features. - * - * The actual coordinates of the geometries MUST be stored as native numbers, - * i.e. using the DOUBLE type in a (repeated) group of fields (exact - * repetition depending on the geometry type). 
- * - * For the POINT encoding, this results in a struct of two fields for x and y - * coordinates (in case of 2D geometries): - * optional group geometry { - * required double x; - * required double y; - * } - * - * For more detail, please refer to link below: - * https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#encoding - * - * WARNING: GeometryStatistics cannot be enabled for these encodings because - * only leaf columns can have column statistics and page index. In this case, - * the statistics for the leaf columns contain equivalent information to the - * bounding box. - */ - // Native encodings are controversial from the discussion. Now they are commented - // out to show a possible approach for future extension. - // POINT = 1; - // LINESTRING = 2; - // POLYGON = 3; - // MULTIPOINT = 4; - // MULTILINESTRING = 5; - // MULTIPOLYGON = 6; + // TODO: add native encoding from GeoParquet/GeoArrow } /** From 0db6d9fed5410388f04f010ef11039ec0562f56d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 16 Jun 2024 15:30:17 +0800 Subject: [PATCH 06/33] address feedback --- src/main/thrift/parquet.thrift | 39 +++++++++++++++------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index ae8bd1f37..11ed38f1a 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -240,7 +240,7 @@ struct SizeStatistics { /** * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge * between points represent a straight cartesian line or the shortest line on - * the sphere. + * the sphere. Please note that it only applies to polygons. */ enum Edges { PLANAR = 0; @@ -248,20 +248,17 @@ enum Edges { } /** - * A custom WKB-encoded geometry data to be used in geometry statistics. 
- * The geometry may be a polygon to encode an s2 or h3 covering to provide - * vendor-agnostic coverings, or an evelope of geometries when a bounding - * box cannot be built (e.g. a geometry has spherical edges, or if an edge - * of geographic coordinates crosses the antimeridian). + * A custom WKB-encoded polygon or multi-polygon to represent a covering of + * geometries. For example, it may be a bounding box, or an evelope of geometries + * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if + * an edge of geographic coordinates crosses the antimeridian). In addition, it can + * also be used to provide vendor-agnostic coverings like S2 or H3 grids. */ -struct Geometry { +struct Covering { /** Bytes of a WKB-encoded geometry */ 1: required binary geometry; - /** - * Edges of the geometry if it is a polygon. It may be different to the - * edges attribute from the GEOMETRY logical type. - */ - 2: optional Edges edges; + /** Edges of the geometry, which is independent of edges from the logical type */ + 2: required Edges edges; } /** @@ -279,15 +276,13 @@ struct BoundingBox { 8: optional double mmax; } -struct Covering { - optional BoundingBox bbox // A bounding box of geometries if it can be built. - optional Geometry covering // A covering polygon of geometries if bbox is unavailable. 
-} - /** Statistics specific to GEOMETRY logical type */ struct GeometryStatistics { - /** Covering of geometries */ - 1: optional Covering covering; + /** A bounding box of geometries */ + 1: optional BoundingBox bbox; + + /** A covering polygon of geometries */ + 2: optional Covering covering; /** * The geometry types of all geometries, or an empty array if they are not @@ -306,7 +301,7 @@ struct GeometryStatistics { * Please refer to link below for more detail: * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ - 2: optional list geometry_types; + 3: optional list geometry_types; } /** @@ -456,7 +451,7 @@ struct BsonType { } /** - * Phyiscal type and encoding for the geometry type. + * Physical type and encoding for the geometry type. */ enum GeometryEncoding { /** @@ -497,7 +492,7 @@ struct GeometryType { * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata. */ - 4: optional string metadata; + 4: optional binary metadata; } /** From e817af45c60636139c0283bea67e378f63089299 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 16 Jun 2024 15:31:20 +0800 Subject: [PATCH 07/33] fix typo --- src/main/thrift/parquet.thrift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 11ed38f1a..4ecd3a1f7 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -455,7 +455,7 @@ struct BsonType { */ enum GeometryEncoding { /** - * Allowed for phyiscal type: BYTE_ARRAY. + * Allowed for physical type: BYTE_ARRAY. * * Well-known binary (WKB) representations of geometries. It supports 2D or * 3D geometries of the standard geometry types (Point, LineString, Polygon, @@ -475,7 +475,7 @@ enum GeometryEncoding { */ struct GeometryType { /** - * Phyiscal type and encoding for the geometry type. Please refer to the + * Physical type and encoding for the geometry type. 
Please refer to the * definition of GeometryEncoding for more detail. */ 1: required GeometryEncoding encoding; From f78f7bd4c4430088142233ea127f227ef6fc889b Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 19 Jun 2024 22:17:03 +0800 Subject: [PATCH 08/33] use WKB type code --- src/main/thrift/parquet.thrift | 38 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 4ecd3a1f7..b1528aad9 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -286,22 +286,32 @@ struct GeometryStatistics { /** * The geometry types of all geometries, or an empty array if they are not - * known. It follows the same rule of `geometry_types` column metadata of - * GeoParquet. Accepted geometry types are: "Point", "LineString", "Polygon", - * "MultiPoint", "MultiLineString", "MultiPolygon", "GeometryCollection". + * known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] + * except that values in the list are WKB (ISO variant) integer codes [2]. Table + * below shows the most common geometry types and their codes: + * + * | Type | XY | XYZ | XYM | XYZM | + * | :----------------- | :--- | :--- | :--- | :--: | + * | Point | 0001 | 1001 | 2001 | 3001 | + * | LineString | 0002 | 1002 | 2002 | 3002 | + * | Polygon | 0003 | 1003 | 2003 | 3003 | + * | MultiPoint | 0004 | 1004 | 2004 | 3004 | + * | MultiLineString | 0005 | 1005 | 2005 | 3005 | + * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | * * In addition, the following rules are used: - * - In case of 3D geometries, a `" Z"` suffix gets added (e.g. `["Point Z"]`). * - A list of multiple values indicates that multiple geometry types are - * present (e.g. `["Polygon", "MultiPolygon"]`). + * present (e.g. `[0003, 0006]`). * - An empty array explicitly signals that the geometry types are not known. 
- * - The geometry types in the list must be unique (e.g. `["Point", "Point"]` + * - The geometry types in the list must be unique (e.g. `[0001, 0001]` * is not valid). * - * Please refer to link below for more detail: - * https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 + * Please refer to links below for more detail: + * [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 */ - 3: optional list geometry_types; + 3: optional list geometry_types; } /** @@ -480,14 +490,14 @@ struct GeometryType { */ 1: required GeometryEncoding encoding; /** - * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth, e.g. OGC:CRS84 + * Edges of polygon. */ - 2: optional string crs; + 2: required Edges edges; /** - * Edges of polygon. + * Coordinate Reference System, i.e. mapping of how coordinates refer to + * precise locations on earth, e.g. OGC:CRS84 */ - 3: optional Edges edges; + 3: optional string crs; /** * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata. From 1aaaca8859cea3693ea897d7b1f534aceb304618 Mon Sep 17 00:00:00 2001 From: Feng Zhang Date: Wed, 7 Aug 2024 07:57:19 -0700 Subject: [PATCH 09/33] Update covering and geometry type protocol based on comments (#2) --- src/main/thrift/parquet.thrift | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index b1528aad9..f9e901dc3 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -249,16 +249,22 @@ enum Edges { /** * A custom WKB-encoded polygon or multi-polygon to represent a covering of - * geometries. 
For example, it may be a bounding box, or an evelope of geometries - * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if + * geometries. For example, it may be a bounding box or an envelope of geometries + * when a bounding box cannot be built (e.g., a geometry has spherical edges, or if * an edge of geographic coordinates crosses the antimeridian). In addition, it can * also be used to provide vendor-agnostic coverings like S2 or H3 grids. */ struct Covering { - /** Bytes of a WKB-encoded geometry */ - 1: required binary geometry; - /** Edges of the geometry, which is independent of edges from the logical type */ - 2: required Edges edges; + /** + * A type of covering. Currently accepted values: "WKB". + */ + 1: required string kind; + /** A payload specific to kind: + * - WKB: well-known binary of a POLYGON that completely covers the contents. + * This will be interpreted according to the same CRS and edges defined by + * the logical type. + */ + 2: required binary value; } /** @@ -281,8 +287,8 @@ struct GeometryStatistics { /** A bounding box of geometries */ 1: optional BoundingBox bbox; - /** A covering polygon of geometries */ - 2: optional Covering covering; + /** A list of coverings of geometries */ + 2: optional list coverings; /** * The geometry types of all geometries, or an empty array if they are not @@ -495,14 +501,19 @@ struct GeometryType { 2: required Edges edges; /** * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth, e.g. OGC:CRS84 + * precise locations on earth. */ 3: optional string crs; + /** + * Encoding used in the above crs field. + * Currently the only allowed value is "PROJJSON". + */ + 4: optional string crs_encoding; /** * Additional informative metadata. * It can be used by GeoParquet to offload some of the column metadata. 
*/ - 4: optional binary metadata; + 5: optional binary metadata; } /** From ee5b2df8f972b4507ad7630bdc93d9bb765f8dfc Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Wed, 14 Aug 2024 22:25:16 -0700 Subject: [PATCH 10/33] Add the new suggestion according to the meeting with Snowflake (#3) --- src/main/thrift/parquet.thrift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index f9e901dc3..dc85274b2 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -270,6 +270,9 @@ struct Covering { /** * Bounding box of geometries in the representation of min/max value pair of * coordinates from each axis. Values of Z and M are omitted for 2D geometries. + * Filter pushdown on geometries are only safe for planar spatial predicate + * but it is recommended that the writer always generates bounding box statistics, + * regardless of whether the geometries are planar or spherical. */ struct BoundingBox { 1: required double xmin; From 19cc081d823b4affe43a7bb3134defdd1fba80ba Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 21 Aug 2024 00:04:18 +0800 Subject: [PATCH 11/33] change metadata to string type and rewording WKB description --- src/main/thrift/parquet.thrift | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index dc85274b2..5783b5346 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -248,7 +248,7 @@ enum Edges { } /** - * A custom WKB-encoded polygon or multi-polygon to represent a covering of + * A custom binary-encoded polygon or multi-polygon to represent a covering of * geometries. For example, it may be a bounding box or an envelope of geometries * when a bounding box cannot be built (e.g., a geometry has spherical edges, or if * an edge of geographic coordinates crosses the antimeridian). 
In addition, it can @@ -259,10 +259,11 @@ struct Covering { * A type of covering. Currently accepted values: "WKB". */ 1: required string kind; - /** A payload specific to kind: - * - WKB: well-known binary of a POLYGON that completely covers the contents. - * This will be interpreted according to the same CRS and edges defined by - * the logical type. + /** + * A payload specific to kind: + * - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + * covers the contents. This will be interpreted according to the same CRS + * and edges defined by the logical type. */ 2: required binary value; } @@ -318,7 +319,7 @@ struct GeometryStatistics { * * Please refer to links below for more detail: * [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary - * [2] https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 */ 3: optional list geometry_types; } @@ -483,6 +484,11 @@ enum GeometryEncoding { * * This encoding enables GeometryStatistics to be set in the column chunk * and page index. + * + * Please note that we follow the same rule of WKB and coordinate axis order + * of GeoParquet, see detail below: + * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 */ WKB = 0; @@ -514,9 +520,10 @@ struct GeometryType { 4: optional string crs_encoding; /** * Additional informative metadata. - * It can be used by GeoParquet to offload some of the column metadata. 
+ * GeoParquet could offload its column metadata in a JSON-encoded UTF-8 string: + * https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L46 */ - 5: optional binary metadata; + 5: optional string metadata; } /** From 16c5868e6ee11af40be9b6d11e80a8bd49b8ffe1 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 21 Aug 2024 22:50:56 +0800 Subject: [PATCH 12/33] add example for crs --- src/main/thrift/parquet.thrift | 40 +++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 5783b5346..12e9fb4a9 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -511,10 +511,48 @@ struct GeometryType { /** * Coordinate Reference System, i.e. mapping of how coordinates refer to * precise locations on earth. + * + * For example, OGC:CRS84 encoded in PROJJSON is set as below: + * { + * "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + * "type": "GeographicCRS", + * "name": "WGS 84 longitude-latitude", + * "datum": { + * "type": "GeodeticReferenceFrame", + * "name": "World Geodetic System 1984", + * "ellipsoid": { + * "name": "WGS 84", + * "semi_major_axis": 6378137, + * "inverse_flattening": 298.257223563 + * } + * }, + * "coordinate_system": { + * "subtype": "ellipsoidal", + * "axis": [ + * { + * "name": "Geodetic longitude", + * "abbreviation": "Lon", + * "direction": "east", + * "unit": "degree" + * }, + * { + * "name": "Geodetic latitude", + * "abbreviation": "Lat", + * "direction": "north", + * "unit": "degree" + * } + * ] + * }, + * "id": { + * "authority": "OGC", + * "code": "CRS84" + * } + * } */ 3: optional string crs; /** - * Encoding used in the above crs field. + * Encoding used in the above crs field. If MUST be set if crs is set. + * * Currently the only allowed value is "PROJJSON". 
*/ 4: optional string crs_encoding; From 56a65de1791c6e30c8acbc2364909e951e2d27e3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 21 Aug 2024 23:04:22 +0800 Subject: [PATCH 13/33] reword crs --- src/main/thrift/parquet.thrift | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 12e9fb4a9..aa964d2ae 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -510,9 +510,9 @@ struct GeometryType { 2: required Edges edges; /** * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth. - * - * For example, OGC:CRS84 encoded in PROJJSON is set as below: + * precise locations on earth. Writers are not required to set this field. + * Once crs is set, crs_encoding field below MUST be set together. + * For example, "OGC:CRS84" can be set in the form of PROJJSON as below: * { * "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", * "type": "GeographicCRS", @@ -551,8 +551,7 @@ struct GeometryType { */ 3: optional string crs; /** - * Encoding used in the above crs field. If MUST be set if crs is set. - * + * Encoding used in the above crs field. It MUST be set if crs field is set. * Currently the only allowed value is "PROJJSON". */ 4: optional string crs_encoding; From f28b2823a616229dc9bc1102a72658743ea01c00 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 22 Aug 2024 11:51:43 +0800 Subject: [PATCH 14/33] clarify WKB --- src/main/thrift/parquet.thrift | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index aa964d2ae..3b5936ac4 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -240,7 +240,7 @@ struct SizeStatistics { /** * Interpretation for edges of GEOMETRY logical type, i.e. 
whether the edge * between points represent a straight cartesian line or the shortest line on - * the sphere. Please note that it only applies to polygons. + * the sphere. It applies to all non-point geometry objects. */ enum Edges { PLANAR = 0; @@ -260,7 +260,7 @@ struct Covering { */ 1: required string kind; /** - * A payload specific to kind: + * A payload specific to kind. Below are the supported values: * - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely * covers the contents. This will be interpreted according to the same CRS * and edges defined by the logical type. @@ -477,22 +477,20 @@ enum GeometryEncoding { /** * Allowed for physical type: BYTE_ARRAY. * - * Well-known binary (WKB) representations of geometries. It supports 2D or - * 3D geometries of the standard geometry types (Point, LineString, Polygon, - * MultiPoint, MultiLineString, MultiPolygon, and GeometryCollection). This - * is the preferred option for maximum portability. + * Well-known binary (WKB) representations of geometries. * - * This encoding enables GeometryStatistics to be set in the column chunk - * and page index. + * To be clear, we follow the same rule of WKB and coordinate axis order from + * GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM and the + * standard geometry types (Point, LineString, Polygon, MultiPoint, + * MultiLineString, MultiPolygon, and GeometryCollection). + * + * This is the preferred encoding for maximum portability. It also supports + * GeometryStatistics to be set in the column chunk and page index. 
* - * Please note that we follow the same rule of WKB and coordinate axis order - * of GeoParquet, see detail below: * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 */ WKB = 0; - - // TODO: add native encoding from GeoParquet/GeoArrow } /** @@ -500,12 +498,13 @@ enum GeometryEncoding { */ struct GeometryType { /** - * Physical type and encoding for the geometry type. Please refer to the - * definition of GeometryEncoding for more detail. + * Physical type and encoding for the geometry type. + * Please refer to the definition of GeometryEncoding for more detail. */ 1: required GeometryEncoding encoding; /** - * Edges of polygon. + * Edges of geometry type. + * Please refer to the definition of Edges for more detail. */ 2: required Edges edges; /** From 51277021614192b63120c93dd0a11c11e186080f Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 24 Aug 2024 13:40:25 +0800 Subject: [PATCH 15/33] clarify coverings --- src/main/thrift/parquet.thrift | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 3b5936ac4..ec61f3175 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -250,7 +250,7 @@ enum Edges { /** * A custom binary-encoded polygon or multi-polygon to represent a covering of * geometries. For example, it may be a bounding box or an envelope of geometries - * when a bounding box cannot be built (e.g., a geometry has spherical edges, or if + * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if * an edge of geographic coordinates crosses the antimeridian). In addition, it can * also be used to provide vendor-agnostic coverings like S2 or H3 grids. 
*/ @@ -291,7 +291,12 @@ struct GeometryStatistics { /** A bounding box of geometries */ 1: optional BoundingBox bbox; - /** A list of coverings of geometries */ + /** + * A list of coverings of geometries. + * Note that It is allowed to have more than one covering of the same kind and + * implementation is free to use any of them. It is recommended to have at most + * one covering for each kind. + */ 2: optional list coverings; /** From 298ab6402047f5170418cd67cb8a2d5ed10ce716 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Tue, 10 Sep 2024 22:12:31 -0700 Subject: [PATCH 16/33] Update the suggestion for bbox stats (#4) * Add the new suggestion according to the meeting with Snowflake * Refine the description according to the suggestion --- src/main/thrift/parquet.thrift | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index ec61f3175..715ec97a0 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -271,9 +271,8 @@ struct Covering { /** * Bounding box of geometries in the representation of min/max value pair of * coordinates from each axis. Values of Z and M are omitted for 2D geometries. - * Filter pushdown on geometries are only safe for planar spatial predicate - * but it is recommended that the writer always generates bounding box statistics, - * regardless of whether the geometries are planar or spherical. + * Filter pushdown on geometries using this is only safe for planar spatial + * filters. 
*/ struct BoundingBox { 1: required double xmin; From 41c639427a188e86373f41ab7580264371bb0a2a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:08:47 +0800 Subject: [PATCH 17/33] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 715ec97a0..cb69fd431 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -484,9 +484,13 @@ enum GeometryEncoding { * Well-known binary (WKB) representations of geometries. * * To be clear, we follow the same rule of WKB and coordinate axis order from - * GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM and the - * standard geometry types (Point, LineString, Polygon, MultiPoint, - * MultiLineString, MultiPolygon, and GeometryCollection). + * GeoParquet [1][2]. Geometries SHOULD be encoded as ISO WKB [3][4] + * supporting XY, XYZ, XYM, XYZM and the standard geometry types + * Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon, + * and GeometryCollection). Coordinate order is always (x, y) where x is + * easting or longitude and y is northing or latitude. This ordering explicitly + * overrides the axis order as specified in the CRS following the GeoPackage + * specification [5]. * * This is the preferred encoding for maximum portability. It also supports * GeometryStatistics to be set in the column chunk and page index. 
From d86abe4a5bbddd0fb402137b0e693dbfc29ab0ba Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:08:58 +0800 Subject: [PATCH 18/33] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index cb69fd431..8e126b535 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -497,6 +497,9 @@ enum GeometryEncoding { * * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + * [3] https://portal.ogc.org/files/?artifact_id=18241 + * [4] https://www.iso.org/standard/60343.html + * [5] https://www.geopackage.org/spec130/#gpb_spec */ WKB = 0; } From c7a4f4cc99c753320c2e0a803c6de22a1c0d728c Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:09:29 +0800 Subject: [PATCH 19/33] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 8e126b535..e6e9dcb51 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -238,9 +238,16 @@ struct SizeStatistics { } /** - * Interpretation for edges of GEOMETRY logical type, i.e. whether the edge - * between points represent a straight cartesian line or the shortest line on - * the sphere. It applies to all non-point geometry objects. + * Interpretation for edges of elements of a GEOMETRY logical type. 
In other + * words, whether a point between two vertices should be interpolated in + * its XY dimensions as if it were a Cartesian line connecting the two + * vertices (planar) or the shortest spherical arc between the longitude + * and latitude represented by the two vertices (spherical). This value + * applies to all non-point geometry objects and is independent of the + * coordinate reference system. + * + * Because most systems currently assume planar edges and do not support + * spherical edges, planar should be used as the default value. */ enum Edges { PLANAR = 0; From f20f68598452049e29c999abb22faf0efa6c8cc9 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:09:53 +0800 Subject: [PATCH 20/33] Update src/main/thrift/parquet.thrift Co-authored-by: Dewey Dunnington --- src/main/thrift/parquet.thrift | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index e6e9dcb51..8c289144d 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -258,8 +258,9 @@ enum Edges { * A custom binary-encoded polygon or multi-polygon to represent a covering of * geometries. For example, it may be a bounding box or an envelope of geometries * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if - * an edge of geographic coordinates crosses the antimeridian). In addition, it can - * also be used to provide vendor-agnostic coverings like S2 or H3 grids. + * an edge of geographic coordinates crosses the antimeridian). It may be + * extended in future versions to provide vendor-agnostic coverings like + * vectors of cells on a discrete global grid (e.g., S2 or H3 cells). 
*/ struct Covering { /** From dbf9d54ff725afd0067551a283936cb865b4869e Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 20 Sep 2024 23:46:49 +0800 Subject: [PATCH 21/33] address feedback about edges and wkb --- src/main/thrift/parquet.thrift | 77 ++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 8c289144d..a6c2822ec 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -237,6 +237,36 @@ struct SizeStatistics { 3: optional list definition_level_histogram; } +/** + * Physical type and encoding for the geometry type. + */ +enum GeometryEncoding { + /** + * Allowed for physical type: BYTE_ARRAY. + * + * Well-known binary (WKB) representations of geometries. + * + * To be clear, we follow the same rule of WKB and coordinate axis order from + * GeoParquet [1][2]. Geometries SHOULD be encoded as ISO WKB [3][4] + * supporting XY, XYZ, XYM, XYZM and the standard geometry types + * Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon, + * and GeometryCollection). Coordinate order is always (x, y) where x is + * easting or longitude and y is northing or latitude. This ordering explicitly + * overrides the axis order as specified in the CRS following the GeoPackage + * specification [5]. + * + * This is the preferred encoding for maximum portability. It also supports + * GeometryStatistics to be set in the column chunk and page index. + * + * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + * [3] https://portal.ogc.org/files/?artifact_id=18241 + * [4] https://www.iso.org/standard/60343.html + * [5] https://www.geopackage.org/spec130/#gpb_spec + */ + WKB = 0; +} + /** * Interpretation for edges of elements of a GEOMETRY logical type. 
In other * words, whether a point between two vertices should be interpolated in @@ -249,7 +279,7 @@ struct SizeStatistics { * Because most systems currently assume planar edges and do not support * spherical edges, planar should be used as the default value. */ -enum Edges { +enum EdgeInterpolation { PLANAR = 0; SPHERICAL = 1; } @@ -482,36 +512,6 @@ struct JsonType { struct BsonType { } -/** - * Physical type and encoding for the geometry type. - */ -enum GeometryEncoding { - /** - * Allowed for physical type: BYTE_ARRAY. - * - * Well-known binary (WKB) representations of geometries. - * - * To be clear, we follow the same rule of WKB and coordinate axis order from - * GeoParquet [1][2]. Geometries SHOULD be encoded as ISO WKB [3][4] - * supporting XY, XYZ, XYM, XYZM and the standard geometry types - * Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon, - * and GeometryCollection). Coordinate order is always (x, y) where x is - * easting or longitude and y is northing or latitude. This ordering explicitly - * overrides the axis order as specified in the CRS following the GeoPackage - * specification [5]. - * - * This is the preferred encoding for maximum portability. It also supports - * GeometryStatistics to be set in the column chunk and page index. - * - * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 - * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 - * [3] https://portal.ogc.org/files/?artifact_id=18241 - * [4] https://www.iso.org/standard/60343.html - * [5] https://www.geopackage.org/spec130/#gpb_spec - */ - WKB = 0; -} - /** * Geometry logical type annotation (added in 2.11.0) */ @@ -522,10 +522,12 @@ struct GeometryType { */ 1: required GeometryEncoding encoding; /** - * Edges of geometry type. + * Interpretation for edges of elements of a GEOMETRY logical type, i.e. 
whether + * the interpolation between points along an edge represents a straight cartesian + * line or the shortest line on the sphere. * Please refer to the definition of Edges for more detail. */ - 2: required Edges edges; + 2: required EdgeInterpolation edges; /** * Coordinate Reference System, i.e. mapping of how coordinates refer to * precise locations on earth. Writers are not required to set this field. @@ -574,11 +576,12 @@ struct GeometryType { */ 4: optional string crs_encoding; /** - * Additional informative metadata. - * GeoParquet could offload its column metadata in a JSON-encoded UTF-8 string: - * https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L46 + * Additional informative metadata as a list of key-value pair of UTF-8 string. + * It is not strictly required by the low-level Parquet implementation for + * features like statistics or filter pushdown. Using a list of key-value pair + * provides maximum flexibility for adding future informative metadata. */ - 5: optional string metadata; + 5: optional list key_value_metadata; } /** From b4296aa4da53bd7651c3a4dcfcfcd64919593655 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 27 Sep 2024 09:48:09 +0800 Subject: [PATCH 22/33] add geoparquet column metadata back --- src/main/thrift/parquet.thrift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index a6c2822ec..a95f368f1 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -577,9 +577,13 @@ struct GeometryType { 4: optional string crs_encoding; /** * Additional informative metadata as a list of key-value pair of UTF-8 string. + * * It is not strictly required by the low-level Parquet implementation for * features like statistics or filter pushdown. Using a list of key-value pair * provides maximum flexibility for adding future informative metadata. 
+ * + * GeoParquet could store its column metadata in this field: + * https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L46 */ 5: optional list key_value_metadata; } From 9bcea6eabb0ba6dc0a81bc427853b5ddeb921476 Mon Sep 17 00:00:00 2001 From: Jia Yu Date: Fri, 4 Oct 2024 02:56:13 -0700 Subject: [PATCH 23/33] Update the spec according to the new feedback (#5) * Update the spec according to the new feedback * Fix typo --- src/main/thrift/parquet.thrift | 114 +++++++-------------------------- 1 file changed, 24 insertions(+), 90 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index a95f368f1..9d3029481 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -279,43 +279,26 @@ enum GeometryEncoding { * Because most systems currently assume planar edges and do not support * spherical edges, planar should be used as the default value. */ -enum EdgeInterpolation { +enum Edges { PLANAR = 0; SPHERICAL = 1; } -/** - * A custom binary-encoded polygon or multi-polygon to represent a covering of - * geometries. For example, it may be a bounding box or an envelope of geometries - * when a bounding box cannot be built (e.g. a geometry has spherical edges, or if - * an edge of geographic coordinates crosses the antimeridian). It may be - * extended in future versions to provide vendor-agnostic coverings like - * vectors of cells on a discrete global grid (e.g., S2 or H3 cells). - */ -struct Covering { - /** - * A type of covering. Currently accepted values: "WKB". - */ - 1: required string kind; - /** - * A payload specific to kind. Below are the supported values: - * - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely - * covers the contents. This will be interpreted according to the same CRS - * and edges defined by the logical type. 
- */ - 2: required binary value; -} - /** * Bounding box of geometries in the representation of min/max value pair of - * coordinates from each axis. Values of Z and M are omitted for 2D geometries. - * Filter pushdown on geometries using this is only safe for planar spatial - * filters. + * coordinates from each axis when Edges is planar. Values of Z and M are omitted + * for 2D geometries. When Edges is spherical, the bounding box is in the form of + * [westmost, eastmost, southmost, northmost], with necessary min/max values for + * Z and M if needed. */ struct BoundingBox { + /** Westmost value if edges = spherical **/ 1: required double xmin; + /** Eastmost value if edges = spherical **/ 2: required double xmax; + /** Southmost value if edges = spherical **/ 3: required double ymin; + /** Northmost value if edges = spherical **/ 4: required double ymax; 5: optional double zmin; 6: optional double zmax; @@ -328,14 +311,6 @@ struct GeometryStatistics { /** A bounding box of geometries */ 1: optional BoundingBox bbox; - /** - * A list of coverings of geometries. - * Note that It is allowed to have more than one covering of the same kind and - * implementation is free to use any of them. It is recommended to have at most - * one covering for each kind. - */ - 2: optional list coverings; - /** * The geometry types of all geometries, or an empty array if they are not * known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] @@ -363,7 +338,7 @@ struct GeometryStatistics { * [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 */ - 3: optional list geometry_types; + 2: optional list geometry_types; } /** @@ -527,65 +502,24 @@ struct GeometryType { * line or the shortest line on the sphere. * Please refer to the definition of Edges for more detail. 
*/ - 2: required EdgeInterpolation edges; - /** - * Coordinate Reference System, i.e. mapping of how coordinates refer to - * precise locations on earth. Writers are not required to set this field. - * Once crs is set, crs_encoding field below MUST be set together. - * For example, "OGC:CRS84" can be set in the form of PROJJSON as below: - * { - * "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", - * "type": "GeographicCRS", - * "name": "WGS 84 longitude-latitude", - * "datum": { - * "type": "GeodeticReferenceFrame", - * "name": "World Geodetic System 1984", - * "ellipsoid": { - * "name": "WGS 84", - * "semi_major_axis": 6378137, - * "inverse_flattening": 298.257223563 - * } - * }, - * "coordinate_system": { - * "subtype": "ellipsoidal", - * "axis": [ - * { - * "name": "Geodetic longitude", - * "abbreviation": "Lon", - * "direction": "east", - * "unit": "degree" - * }, - * { - * "name": "Geodetic latitude", - * "abbreviation": "Lat", - * "direction": "north", - * "unit": "degree" - * } - * ] - * }, - * "id": { - * "authority": "OGC", - * "code": "CRS84" - * } - * } - */ - 3: optional string crs; - /** - * Encoding used in the above crs field. It MUST be set if crs field is set. - * Currently the only allowed value is "PROJJSON". - */ - 4: optional string crs_encoding; + 2: required Edges edges; /** - * Additional informative metadata as a list of key-value pair of UTF-8 string. + * CRS (coordinate reference system) is a mapping of how coordinates refer to + * precise locations on earth. A crs is specified by a string, which is a Parquet + * file metadata field whose value is the crs representation. An additional field + * with the suffix '.type' describes the encoding of this CRS representation. * - * It is not strictly required by the low-level Parquet implementation for - * features like statistics or filter pushdown. Using a list of key-value pair - * provides maximum flexibility for adding future informative metadata. 
+ * For example, if a geometry column (e.g., 'geom1') uses the CRS 'OGC:CRS84', the + * writer may create 2 file metadata fields: 'geom1_crs' and 'geom1_crs.type', and + * set the 'crs' field to 'geom1_crs'. The 'geom1_crs' field will contain the + * PROJJSON representation of OGC:CRS84 + * (https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#ogccrs84-details), + * and the 'geom1_crs.type' field will contain the string 'PROJJSON'. * - * GeoParquet could store its column metadata in this field: - * https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L46 + * Multiple geometry columns can refer to the same CRS metadata field + * (e.g., 'geom1_crs') if they share the same CRS. */ - 5: optional list key_value_metadata; + 3: optional string crs; } /** From 99f04032a1685833f7f5e2340fbd4883e8770065 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 12 Oct 2024 13:35:03 +0800 Subject: [PATCH 24/33] Update src/main/thrift/parquet.thrift Co-authored-by: emkornfield --- src/main/thrift/parquet.thrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 9d3029481..49ca25660 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -506,7 +506,7 @@ struct GeometryType { /** * CRS (coordinate reference system) is a mapping of how coordinates refer to * precise locations on earth. A crs is specified by a string, which is a Parquet - * file metadata field whose value is the crs representation. An additional field + * file metadata field whose value is the CRS representation. An additional field * with the suffix '.type' describes the encoding of this CRS representation. 
* * For example, if a geometry column (e.g., 'geom1') uses the CRS 'OGC:CRS84', the From dbb78cf8305a75a827c5ec5fe3b75a2033ec8cba Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 12 Oct 2024 13:35:13 +0800 Subject: [PATCH 25/33] Update src/main/thrift/parquet.thrift Co-authored-by: emkornfield --- src/main/thrift/parquet.thrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 49ca25660..07ea3fbc8 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -505,7 +505,7 @@ struct GeometryType { 2: required Edges edges; /** * CRS (coordinate reference system) is a mapping of how coordinates refer to - * precise locations on earth. A crs is specified by a string, which is a Parquet + * precise locations on earth. A CRS is specified by a string, which is a Parquet * file metadata field whose value is the CRS representation. An additional field * with the suffix '.type' describes the encoding of this CRS representation. * From 25df0ff101f539b1b318f99e6e8bfe1c97c10c37 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 13 Oct 2024 21:04:13 +0800 Subject: [PATCH 26/33] add description to LogicalTypes.md --- LogicalTypes.md | 159 +++++++++++++++++++++++++++++++++ src/main/thrift/parquet.thrift | 145 +++++++----------------------- 2 files changed, 190 insertions(+), 114 deletions(-) diff --git a/LogicalTypes.md b/LogicalTypes.md index b55a90884..1e7cdfafd 100644 --- a/LogicalTypes.md +++ b/LogicalTypes.md @@ -767,6 +767,165 @@ optional group my_map (MAP_KEY_VALUE) { } ``` +## Geospatial Types + +### GEOMETRY + +`GEOMETRY` is used for geometry features from [OGC – Simple feature access][simple-feature-access]. +See [Geospatial Notes](#geospatial-notes). + +The type has three type parameters: +- `encoding`: A required enum value for annonated physical type and encoding + for the `GEOMETRY` type. See [Geometry Encoding](#geometry-encoding). 
+- `edges`: A required enum value for interpretation for edges of elements of the + `GEOMETRY` type, i.e. whether the interpolation between points along + an edge represents a straight cartesian line or the shortest line on + the sphere. See [Edges](#edges). +- `crs`: An optional string value for CRS (coordinate reference system), which + is a mapping of how coordinates refer to precise locations on earth. + See [Coordinate Reference System](#coordinate-reference-system). + +The sort order used for `GEOMETRY` is undefined. When writing data, no min/max +statistics should be saved for this type and if such non-compliant statistics +are found during reading, they must be ignored. Instead, [GeometryStatistics](#geometry-statistics) +is introduced for `GEOMETRY` type. + +#### Geometry Encoding + +Physical type and encoding for the `GEOMETRY` type. Supported values: +- `WKB`: `GEOMETRY` type with `WKB` encoding can only be used to annotate the + `BYTE_ARRAY` primitive type. See [WKB](#well-known-binary-wkb). + +##### Well-known binary (WKB) + +Well-known binary (WKB) representations of geometries, see [Geospatial Notes](#geospatial-notes). + +To be clear, we follow the same definitions of GeoParquet for [WKB][geoparquet-wkb] +and [coordinate axis order][coordinate-axis-order]: +- Geometries SHOULD be encoded as ISO WKB supporting XY, XYZ, XYM, XYZM. Supported +standard geometry types: Point, LineString, Polygon, MultiPoint, MultiLineString, +MultiPolygon, and GeometryCollection. +- Coordinate axis order is always (x, y) where x is easting or longitude, and +y is northing or latitude. This ordering explicitly overrides the axis order +as specified in the CRS following the [GeoPackage specification][geopackage-spec]. + +This is the preferred encoding for maximum portability. 
+ +[geoparquet-wkb]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 +[coordinate-axis-order]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 +[geopackage-spec]: https://www.geopackage.org/spec130/#gpb_spec + +#### Edges + +Interpretation for edges of elements of `GEOMETRY` type. In other words, it +specifies how a point between two vertices should be interpolated in its XY +dimensions. Supported values and corresponding interpolation approaches are: +- `PLANAR`: a Cartesian line connecting the two vertices. +- `SPHERICAL`: a shortest spherical arc between the longitude and latitude + represented by the two vertices. + +This value applies to all non-point geometry objects and is independent of the +[Coordinate Reference System](#coordinate-reference-system). + +Because most systems currently assume planar edges and do not support spherical +edges, `PLANAR` should be used as the default value. + +#### Coordinate Reference System + +CRS (coordinate reference system) is a mapping of how coordinates refer to +precise locations on earth. A CRS is specified by a key-value entry in the +`key_value_metadata` field of `FileMetaData` whose key is a short name of +the CRS and value is the CRS representation. An additional entry in the +`key_value_metadata` field with the suffix ".type" is required to describe +the encoding of this CRS representation. + +For example, if a geometry column (e.g., "geom1") uses the CRS "OGC:CRS84", the +writer may write two entries to `key_value_metadata` field of `FileMetaData` as +below, and set the `crs` field of the `GEOMETRY` type to "geom1_crs": +``` + "geom1_crs": an UTF-8 encoded PROJJSON representation of OGC:CRS84 + "geom1_crs.type": "PROJJSON" +``` + +The PROJJSON representation of OGC:CRS84 can be seen at [OGC:CRS84][ogc-crs84]. 
+Multiple geometry columns can refer to the same CRS metadata field +(e.g., "geom1_crs") if they share the same CRS. + +[ogc-crs84]: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#ogccrs84-details + +#### Geometry Statistics + +`GeometryStatistics` is an optional field of `Statistics` for `GEOMETRY` type. +It contains [Bounding Box](#bounding-box) and [Geometry Types](#geometry-types). +Note that geometry statistics in the page index is not supported yet. + +##### Bounding Box + +Bounding box is defined as the thrift struct below in the representation of +min/max value pair of coordinates from each axis. Values of Z and M are omitted +for 2D geometries. + +```thrift +struct BoundingBox { + /** Min value when edges = PLANAR, westmost value if edges = SPHERICAL */ + 1: required double xmin; + /** Max value when edges = PLANAR, eastmost value if edges = SPHERICAL */ + 2: required double xmax; + /** Min value when edges = PLANAR, southmost value if edges = SPHERICAL */ + 3: required double ymin; + /** Max value when edges = PLANAR, northmost value if edges = SPHERICAL */ + 4: required double ymax; + 5: optional double zmin; + 6: optional double zmax; + 7: optional double mmin; + 8: optional double mmax; +} +``` + +The meaning of each value depends on the `Edges` attribute of the `GEOMETRY` type: +- If Edges is `PLANAR`, the values are literally the actual min/max value from each axis. +- If Edges is `SPHERICAL`, the values for X and Y are `[westmost, eastmost, southmost, northmost]`, + with necessary min/max values for Z and M if needed. + +##### Geometry Types + +A list of geometry types from all geometries in the `GEOMETRY` column, or an +empty list if they are not known. + +This is borrowed from [geometry_types of GeoParquet][geometry-types] +except that values in the list are [WKB (ISO-variant) integer codes][wkb-integer-code]. 
+Table below shows the most common geometry types and their codes: + +| Type | XY | XYZ | XYM | XYZM | +| :----------------- | :--- | :--- | :--- | :--: | +| Point | 0001 | 1001 | 2001 | 3001 | +| LineString | 0002 | 1002 | 2002 | 3002 | +| Polygon | 0003 | 1003 | 2003 | 3003 | +| MultiPoint | 0004 | 1004 | 2004 | 3004 | +| MultiLineString | 0005 | 1005 | 2005 | 3005 | +| MultiPolygon | 0006 | 1006 | 2006 | 3006 | +| GeometryCollection | 0007 | 1007 | 2007 | 3007 | + +In addition, the following rules are applied: +- A list of multiple values indicates that multiple geometry types are present (e.g. `[0003, 0006]`). +- An empty array explicitly signals that the geometry types are not known. +- The geometry types in the list must be unique (e.g. `[0001, 0001]` is not valid). + +[geometry-types]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 +[wkb-integer-code]: https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + +#### Geospatial Notes + +The Geometry class hierarchy and its WKT and WKB serializations (ISO supporting +XY, XYZ, XYM, XYZM) are defined by [OpenGIS Implementation Specification for +Geographic information – Simple feature access – Part 1: Common architecture]( +https://portal.ogc.org/files/?artifact_id=25355), from [OGC (Open Geospatial +Consortium)](https://www.ogc.org/standard/sfa/). + +The version of the OGC standard first used here is 1.2.1, but future versions +may also used if the WKB representation remains wire-compatible. + + ## UNKNOWN (always null) Sometimes, when discovering the schema of existing data, values are always null diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 07ea3fbc8..6a601a141 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -237,68 +237,18 @@ struct SizeStatistics { 3: optional list definition_level_histogram; } -/** - * Physical type and encoding for the geometry type. 
- */ -enum GeometryEncoding { - /** - * Allowed for physical type: BYTE_ARRAY. - * - * Well-known binary (WKB) representations of geometries. - * - * To be clear, we follow the same rule of WKB and coordinate axis order from - * GeoParquet [1][2]. Geometries SHOULD be encoded as ISO WKB [3][4] - * supporting XY, XYZ, XYM, XYZM and the standard geometry types - * Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon, - * and GeometryCollection). Coordinate order is always (x, y) where x is - * easting or longitude and y is northing or latitude. This ordering explicitly - * overrides the axis order as specified in the CRS following the GeoPackage - * specification [5]. - * - * This is the preferred encoding for maximum portability. It also supports - * GeometryStatistics to be set in the column chunk and page index. - * - * [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 - * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 - * [3] https://portal.ogc.org/files/?artifact_id=18241 - * [4] https://www.iso.org/standard/60343.html - * [5] https://www.geopackage.org/spec130/#gpb_spec - */ - WKB = 0; -} - -/** - * Interpretation for edges of elements of a GEOMETRY logical type. In other - * words, whether a point between two vertices should be interpolated in - * its XY dimensions as if it were a Cartesian line connecting the two - * vertices (planar) or the shortest spherical arc between the longitude - * and latitude represented by the two vertices (spherical). This value - * applies to all non-point geometry objects and is independent of the - * coordinate reference system. - * - * Because most systems currently assume planar edges and do not support - * spherical edges, planar should be used as the default value. 
- */ -enum Edges { - PLANAR = 0; - SPHERICAL = 1; -} - /** * Bounding box of geometries in the representation of min/max value pair of - * coordinates from each axis when Edges is planar. Values of Z and M are omitted - * for 2D geometries. When Edges is spherical, the bounding box is in the form of - * [westmost, eastmost, southmost, northmost], with necessary min/max values for - * Z and M if needed. + * coordinates from each axis. */ struct BoundingBox { - /** Westmost value if edges = spherical **/ + /** Min value when edges = PLANAR, westmost value if edges = SPHERICAL */ 1: required double xmin; - /** Eastmost value if edges = spherical **/ + /** Max value when edges = PLANAR, eastmost value if edges = SPHERICAL */ 2: required double xmax; - /** Southmost value if edges = spherical **/ + /** Min value when edges = PLANAR, southmost value if edges = SPHERICAL */ 3: required double ymin; - /** Northmost value if edges = spherical **/ + /** Max value when edges = PLANAR, northmost value if edges = SPHERICAL */ 4: required double ymax; 5: optional double zmin; 6: optional double zmax; @@ -310,34 +260,7 @@ struct BoundingBox { struct GeometryStatistics { /** A bounding box of geometries */ 1: optional BoundingBox bbox; - - /** - * The geometry types of all geometries, or an empty array if they are not - * known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] - * except that values in the list are WKB (ISO variant) integer codes [2]. 
Table - * below shows the most common geometry types and their codes: - * - * | Type | XY | XYZ | XYM | XYZM | - * | :----------------- | :--- | :--- | :--- | :--: | - * | Point | 0001 | 1001 | 2001 | 3001 | - * | LineString | 0002 | 1002 | 2002 | 3002 | - * | Polygon | 0003 | 1003 | 2003 | 3003 | - * | MultiPoint | 0004 | 1004 | 2004 | 3004 | - * | MultiLineString | 0005 | 1005 | 2005 | 3005 | - * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | - * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | - * - * In addition, the following rules are used: - * - A list of multiple values indicates that multiple geometry types are - * present (e.g. `[0003, 0006]`). - * - An empty array explicitly signals that the geometry types are not known. - * - The geometry types in the list must be unique (e.g. `[0001, 0001]` - * is not valid). - * - * Please refer to links below for more detail: - * [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary - * [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 - */ + /** Geometry type codes of all geometries, or an empty list if not known */ 2: optional list geometry_types; } @@ -487,38 +410,35 @@ struct JsonType { struct BsonType { } +/** Physical type and encoding for the geometry type */ +enum GeometryEncoding { + /** + * Allowed for physical type: BYTE_ARRAY. + * + * Well-known binary (WKB) representations of geometries. + */ + WKB = 0; +} + +/** Interpretation for edges of elements of a GEOMETRY type */ +enum Edges { + PLANAR = 0; + SPHERICAL = 1; +} + /** - * Geometry logical type annotation (added in 2.11.0) + * GEOMETRY logical type annotation (added in 2.11.0) + * + * GeometryEncoding and Edges are required. CRS is optional. + * + * Once CRS is set, it MUST be a key to an entry in the `key_value_metadata` + * field of `FileMetaData`. + * + * See LogicalTypes.md for detail. 
*/ struct GeometryType { - /** - * Physical type and encoding for the geometry type. - * Please refer to the definition of GeometryEncoding for more detail. - */ 1: required GeometryEncoding encoding; - /** - * Interpretation for edges of elements of a GEOMETRY logical type, i.e. whether - * the interpolation between points along an edge represents a straight cartesian - * line or the shortest line on the sphere. - * Please refer to the definition of Edges for more detail. - */ 2: required Edges edges; - /** - * CRS (coordinate reference system) is a mapping of how coordinates refer to - * precise locations on earth. A CRS is specified by a string, which is a Parquet - * file metadata field whose value is the CRS representation. An additional field - * with the suffix '.type' describes the encoding of this CRS representation. - * - * For example, if a geometry column (e.g., 'geom1') uses the CRS 'OGC:CRS84', the - * writer may create 2 file metadata fields: 'geom1_crs' and 'geom1_crs.type', and - * set the 'crs' field to 'geom1_crs'. The 'geom1_crs' field will contain the - * PROJJSON representation of OGC:CRS84 - * (https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#ogccrs84-details), - * and the 'geom1_crs.type' field will contain the string 'PROJJSON'. - * - * Multiple geometry columns can refer to the same CRS metadata field - * (e.g., 'geom1_crs') if they share the same CRS. - */ 3: optional string crs; } @@ -1123,7 +1043,7 @@ union ColumnOrder { * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined - * GEOMETRY - undefined, use GeometryStatistics instead. + * GEOMETRY - undefined * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true @@ -1264,9 +1184,6 @@ struct ColumnIndex { * Same as repetition_level_histograms except for definitions levels. 
**/ 7: optional list definition_level_histograms; - - /** A list containing statistics of GEOMETRY logical type for each page */ - 8: optional list geometry_stats; } struct AesGcmV1 { From d349727f7fa5830fd52075759684975233ce57f3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 13 Oct 2024 22:25:16 +0800 Subject: [PATCH 27/33] add explanation for Z & M values --- LogicalTypes.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/LogicalTypes.md b/LogicalTypes.md index 1e7cdfafd..2353fd5fa 100644 --- a/LogicalTypes.md +++ b/LogicalTypes.md @@ -861,6 +861,16 @@ Note that geometry statistics in the page index is not supported yet. ##### Bounding Box +A geometry has at least two coordinate dimensions: X and Y for 2D coordinates +of each point. + +A geometry can optionally have Z and / or M values associated with each point +in the geometry. The Z value introduces the third dimension coordinate. The Z +values usually are used to indicate the height, or elevation. M values are an +opportunity for a geometry to express a fourth dimension as a coordinate value. +These values can be used as a linear reference value (e.g., highway milepost +value), a timestamp, or some other value as defined by the CRS. + Bounding box is defined as the thrift struct below in the representation of min/max value pair of coordinates from each axis. Values of Z and M are omitted for 2D geometries. @@ -925,7 +935,6 @@ Consortium)](https://www.ogc.org/standard/sfa/). The version of the OGC standard first used here is 1.2.1, but future versions may also used if the WKB representation remains wire-compatible. 
- ## UNKNOWN (always null) Sometimes, when discovering the schema of existing data, values are always null From 9ea65599c532c4a83f37b09bedf1f5d40229a612 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 16 Oct 2024 22:56:19 +0800 Subject: [PATCH 28/33] move geo stats to ColumnMetaData --- LogicalTypes.md | 46 ++++++++++++++++++++++------------ src/main/thrift/parquet.thrift | 26 +++++++++++-------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/LogicalTypes.md b/LogicalTypes.md index 2353fd5fa..1f4c0162f 100644 --- a/LogicalTypes.md +++ b/LogicalTypes.md @@ -796,6 +796,10 @@ Physical type and encoding for the `GEOMETRY` type. Supported values: - `WKB`: `GEOMETRY` type with `WKB` encoding can only be used to annotate the `BYTE_ARRAY` primitive type. See [WKB](#well-known-binary-wkb). +Note that geometry encoding is required for `GEOMETRY` type. In order to correctly +interpret geometry data, writer implementations SHOULD always set this field, and +reader implementations SHOULD fail for an unknown geometry encoding value. + ##### Well-known binary (WKB) Well-known binary (WKB) representations of geometries, see [Geospatial Notes](#geospatial-notes). @@ -830,6 +834,10 @@ This value applies to all non-point geometry objects and is independent of the Because most systems currently assume planar edges and do not support spherical edges, `PLANAR` should be used as the default value. +Note that edges is required for `GEOMETRY` type. In order to correctly +interpret geometry data, writer implementations SHOULD always set this field, +and reader implementations SHOULD fail for an unknown edges value. + #### Coordinate Reference System CRS (coordinate reference system) is a mapping of how coordinates refer to @@ -855,39 +863,45 @@ Multiple geometry columns can refer to the same CRS metadata field #### Geometry Statistics -`GeometryStatistics` is an optional field of `Statistics` for `GEOMETRY` type. 
-It contains [Bounding Box](#bounding-box) and [Geometry Types](#geometry-types). -Note that geometry statistics in the page index is not supported yet. +`GeometryStatistics` is a struct to store geometry statistics of a column chunk +of `GEOMETRY` type. It is an optional field of `ColumnMetaData` and contains +[Bounding Box](#bounding-box) and [Geometry Types](#geometry-types). ##### Bounding Box A geometry has at least two coordinate dimensions: X and Y for 2D coordinates -of each point. +of each point. A geometry can optionally have Z and / or M values associated +with each point in the geometry. + +The Z values introduce the third dimension coordinate. Usually they are used +to indicate the height, or elevation. -A geometry can optionally have Z and / or M values associated with each point -in the geometry. The Z value introduces the third dimension coordinate. The Z -values usually are used to indicate the height, or elevation. M values are an -opportunity for a geometry to express a fourth dimension as a coordinate value. -These values can be used as a linear reference value (e.g., highway milepost -value), a timestamp, or some other value as defined by the CRS. +M values are an opportunity for a geometry to express a fourth dimension as +a coordinate value. These values can be used as a linear reference value +(e.g., highway milepost value), a timestamp, or some other value as defined +by the CRS. Bounding box is defined as the thrift struct below in the representation of -min/max value pair of coordinates from each axis. Values of Z and M are omitted -for 2D geometries. +min/max value pair of coordinates from each axis. Note that X and Y Values +are always present. Z and M are omitted for 2D geometries. 
```thrift struct BoundingBox { - /** Min value when edges = PLANAR, westmost value if edges = SPHERICAL */ + /** Min X value when edges = PLANAR, westmost value if edges = SPHERICAL */ 1: required double xmin; - /** Max value when edges = PLANAR, eastmost value if edges = SPHERICAL */ + /** Max Y value when edges = PLANAR, eastmost value if edges = SPHERICAL */ 2: required double xmax; - /** Min value when edges = PLANAR, southmost value if edges = SPHERICAL */ + /** Min Y value when edges = PLANAR, southmost value if edges = SPHERICAL */ 3: required double ymin; - /** Max value when edges = PLANAR, northmost value if edges = SPHERICAL */ + /** Max Y value when edges = PLANAR, northmost value if edges = SPHERICAL */ 4: required double ymax; + /** Min Z value if the axis exists */ 5: optional double zmin; + /** Max Z value if the axis exists */ 6: optional double zmax; + /** Min M value if the axis exists */ 7: optional double mmin; + /** Max M value if the axis exists */ 8: optional double mmax; } ``` diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 6a601a141..2e21140f2 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -242,17 +242,21 @@ struct SizeStatistics { * coordinates from each axis. 
*/ struct BoundingBox { - /** Min value when edges = PLANAR, westmost value if edges = SPHERICAL */ + /** Min X value when edges = PLANAR, westmost value if edges = SPHERICAL */ 1: required double xmin; - /** Max value when edges = PLANAR, eastmost value if edges = SPHERICAL */ + /** Max Y value when edges = PLANAR, eastmost value if edges = SPHERICAL */ 2: required double xmax; - /** Min value when edges = PLANAR, southmost value if edges = SPHERICAL */ + /** Min Y value when edges = PLANAR, southmost value if edges = SPHERICAL */ 3: required double ymin; - /** Max value when edges = PLANAR, northmost value if edges = SPHERICAL */ + /** Max Y value when edges = PLANAR, northmost value if edges = SPHERICAL */ 4: required double ymax; + /** Min Z value if the axis exists */ 5: optional double zmin; + /** Max Z value if the axis exists */ 6: optional double zmax; + /** Min M value if the axis exists */ 7: optional double mmin; + /** Max M value if the axis exists */ 8: optional double mmax; } @@ -313,9 +317,6 @@ struct Statistics { 7: optional bool is_max_value_exact; /** If true, min_value is the actual minimum value for a column */ 8: optional bool is_min_value_exact; - - /** statistics specific to geometry logical type */ - 9: optional GeometryStatistics geometry_stats; } /** Empty structs to use as logical type annotations */ @@ -429,10 +430,12 @@ enum Edges { /** * GEOMETRY logical type annotation (added in 2.11.0) * - * GeometryEncoding and Edges are required. CRS is optional. + * GeometryEncoding and Edges are required. In order to correctly interpret + * geometry data, writer implementations SHOULD always them, and reader + * implementations SHOULD fail for unknown values. * - * Once CRS is set, it MUST be a key to an entry in the `key_value_metadata` - * field of `FileMetaData`. + * CRS is optional. Once CRS is set, it MUST be a key to an entry in the + * `key_value_metadata` field of `FileMetaData`. * * See LogicalTypes.md for detail. 
*/ @@ -913,6 +916,9 @@ struct ColumnMetaData { * filter pushdown. */ 16: optional SizeStatistics size_statistics; + + /** Optional statistics specific to GEOMETRY logical type */ + 17: optional GeometryStatistics geometry_stats; } struct EncryptionWithFooterKey { From 011de458d002a5c0936cf2575f0db97365c812d3 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 17 Oct 2024 09:15:08 +0800 Subject: [PATCH 29/33] Update src/main/thrift/parquet.thrift Co-authored-by: Jia Yu --- src/main/thrift/parquet.thrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 2e21140f2..041b2cf8c 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -244,7 +244,7 @@ struct SizeStatistics { struct BoundingBox { /** Min X value when edges = PLANAR, westmost value if edges = SPHERICAL */ 1: required double xmin; - /** Max Y value when edges = PLANAR, eastmost value if edges = SPHERICAL */ + /** Max X value when edges = PLANAR, eastmost value if edges = SPHERICAL */ 2: required double xmax; /** Min Y value when edges = PLANAR, southmost value if edges = SPHERICAL */ 3: required double ymin; From 6425a3cec7a1fde76bf2db99e1f12d9d7a1ca59a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 17 Oct 2024 09:18:58 +0800 Subject: [PATCH 30/33] fix typo --- LogicalTypes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LogicalTypes.md b/LogicalTypes.md index 1f4c0162f..1a03d8640 100644 --- a/LogicalTypes.md +++ b/LogicalTypes.md @@ -889,7 +889,7 @@ are always present. Z and M are omitted for 2D geometries. 
struct BoundingBox { /** Min X value when edges = PLANAR, westmost value if edges = SPHERICAL */ 1: required double xmin; - /** Max Y value when edges = PLANAR, eastmost value if edges = SPHERICAL */ + /** Max X value when edges = PLANAR, eastmost value if edges = SPHERICAL */ 2: required double xmax; /** Min Y value when edges = PLANAR, southmost value if edges = SPHERICAL */ 3: required double ymin; From 15024589a55f7439d29dc1f5036a63e56b2cc27b Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 22 Nov 2024 10:26:23 +0800 Subject: [PATCH 31/33] remove edges and simplify crs --- LogicalTypes.md | 73 +++++++++------------------------- src/main/thrift/parquet.thrift | 41 +++++++++---------- 2 files changed, 37 insertions(+), 77 deletions(-) diff --git a/LogicalTypes.md b/LogicalTypes.md index 0f3fad845..287e0fc74 100644 --- a/LogicalTypes.md +++ b/LogicalTypes.md @@ -811,20 +811,18 @@ optional group my_map (MAP_KEY_VALUE) { See [Geospatial Notes](#geospatial-notes). The type has three type parameters: -- `encoding`: A required enum value for annonated physical type and encoding - for the `GEOMETRY` type. See [Geometry Encoding](#geometry-encoding). -- `edges`: A required enum value for interpretation for edges of elements of the - `GEOMETRY` type, i.e. whether the interpolation between points along - an edge represents a straight cartesian line or the shortest line on - the sphere. See [Edges](#edges). +- `encoding`: A required enum value for annonated physical type and encoding for + the `GEOMETRY` type. See [Geometry Encoding](#geometry-encoding). - `crs`: An optional string value for CRS (coordinate reference system), which - is a mapping of how coordinates refer to precise locations on earth. - See [Coordinate Reference System](#coordinate-reference-system). + is a mapping of how coordinates refer to precise locations on earth. See + [Coordinate Reference System](#coordinate-reference-system). 
+- `crs_encoding`: An optional string value to describes the encoding used by the + `crs` field. See [Coordinate Reference System](#coordinate-reference-system). The sort order used for `GEOMETRY` is undefined. When writing data, no min/max statistics should be saved for this type and if such non-compliant statistics -are found during reading, they must be ignored. Instead, [GeometryStatistics](#geometry-statistics) -is introduced for `GEOMETRY` type. +are found during reading, they must be ignored. [GeometryStatistics](#geometry-statistics) +is introduced to store geometry statistics for `GEOMETRY` type. #### Geometry Encoding @@ -855,45 +853,17 @@ This is the preferred encoding for maximum portability. [coordinate-axis-order]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 [geopackage-spec]: https://www.geopackage.org/spec130/#gpb_spec -#### Edges - -Interpretation for edges of elements of `GEOMETRY` type. In other words, it -specifies how a point between two vertices should be interpolated in its XY -dimensions. Supported values and corresponding interpolation approaches are: -- `PLANAR`: a Cartesian line connecting the two vertices. -- `SPHERICAL`: a shortest spherical arc between the longitude and latitude - represented by the two vertices. - -This value applies to all non-point geometry objects and is independent of the -[Coordinate Reference System](#coordinate-reference-system). - -Because most systems currently assume planar edges and do not support spherical -edges, `PLANAR` should be used as the default value. - -Note that edges is required for `GEOMETRY` type. In order to correctly -interpret geometry data, writer implementations SHOULD always set this field, -and reader implementations SHOULD fail for an unknown edges value. - #### Coordinate Reference System CRS (coordinate reference system) is a mapping of how coordinates refer to -precise locations on earth. 
A CRS is specified by a key-value entry in the -`key_value_metadata` field of `FileMetaData` whose key is a short name of -the CRS and value is the CRS representation. An additional entry in the -`key_value_metadata` field with the suffix ".type" is required to describe -the encoding of this CRS representation. - -For example, if a geometry column (e.g., "geom1") uses the CRS "OGC:CRS84", the -writer may write two entries to `key_value_metadata` field of `FileMetaData` as -below, and set the `crs` field of the `GEOMETRY` type to "geom1_crs": -``` - "geom1_crs": an UTF-8 encoded PROJJSON representation of OGC:CRS84 - "geom1_crs.type": "PROJJSON" -``` +locations on earth. A custom CRS is specified by a string value in the `crs` +field of the `GEOMETRY` type. An additional `crs_encoding` field describes the +encoding used for this CRS representation. Both fields are optional. If custom +CRS is not provided, CRS defaults to "OGC:CRS84". -The PROJJSON representation of OGC:CRS84 can be seen at [OGC:CRS84][ogc-crs84]. -Multiple geometry columns can refer to the same CRS metadata field -(e.g., "geom1_crs") if they share the same CRS. +For example, if a geometry column uses the CRS "OGC:CRS84", the writer may +write a PROJJSON representation of [OGC:CRS84][ogc-crs84] to the `crs` field +and set the `crs_encoding` field to "PROJJSON". [ogc-crs84]: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#ogccrs84-details @@ -923,13 +893,13 @@ are always present. Z and M are omitted for 2D geometries. 
```thrift struct BoundingBox { - /** Min X value when edges = PLANAR, westmost value if edges = SPHERICAL */ + /** Westmost value (min longitude) on the X axis */ 1: required double xmin; - /** Max X value when edges = PLANAR, eastmost value if edges = SPHERICAL */ + /** Eastmost value (max longitude) on the X axis */ 2: required double xmax; - /** Min Y value when edges = PLANAR, southmost value if edges = SPHERICAL */ + /** Southmost value (min latitude) on the Y axis */ 3: required double ymin; - /** Max Y value when edges = PLANAR, northmost value if edges = SPHERICAL */ + /** Northmost value (max latitude) on the Y axis */ 4: required double ymax; /** Min Z value if the axis exists */ 5: optional double zmin; @@ -942,11 +912,6 @@ struct BoundingBox { } ``` -The meaning of each value depends on the `Edges` attribute of the `GEOMETRY` type: -- If Edges is `PLANAR`, the values are literally the actual min/max value from each axis. -- If Edges is `SPHERICAL`, the values for X and Y are `[westmost, eastmost, southmost, northmost]`, - with necessary min/max values for Z and M if needed. - ##### Geometry Types A list of geometry types from all geometries in the `GEOMETRY` column, or an diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 6378a58eb..5d5e49094 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -242,13 +242,13 @@ struct SizeStatistics { * coordinates from each axis. 
*/ struct BoundingBox { - /** Min X value when edges = PLANAR, westmost value if edges = SPHERICAL */ + /** Westmost value (min longitude) on the X axis */ 1: required double xmin; - /** Max X value when edges = PLANAR, eastmost value if edges = SPHERICAL */ + /** Eastmost value (max longitude) on the X axis */ 2: required double xmax; - /** Min Y value when edges = PLANAR, southmost value if edges = SPHERICAL */ + /** Southmost value (min latitude) on the Y axis */ 3: required double ymin; - /** Max Y value when edges = PLANAR, northmost value if edges = SPHERICAL */ + /** Northmost value (max latitude) on the Y axis */ 4: required double ymax; /** Min Z value if the axis exists */ 5: optional double zmin; @@ -411,6 +411,12 @@ struct JsonType { struct BsonType { } +/** + * Embedded Variant logical type annotation + */ +struct VariantType { +} + /** Physical type and encoding for the geometry type */ enum GeometryEncoding { /** @@ -421,34 +427,23 @@ enum GeometryEncoding { WKB = 0; } -/** Interpretation for edges of elements of a GEOMETRY type */ -enum Edges { - PLANAR = 0; - SPHERICAL = 1; -} - /** * GEOMETRY logical type annotation (added in 2.11.0) * - * GeometryEncoding and Edges are required. In order to correctly interpret - * geometry data, writer implementations SHOULD always them, and reader - * implementations SHOULD fail for unknown values. + * GeometryEncoding is required. In order to correctly interpret geometry data, + * writer implementations SHOULD always set it, and reader implementations + * SHOULD fail for unknown values. * - * CRS is optional. Once CRS is set, it MUST be a key to an entry in the - * `key_value_metadata` field of `FileMetaData`. + * CRS is optional. A custom CRS and its corresponding encoding can be set to + * crs and crs_encoding fields respectively. If missing, the CRS defaults to + * "OGC:CRS84". * * See LogicalTypes.md for detail. 
 */ struct GeometryType { 1: required GeometryEncoding encoding; - 2: required Edges edges; - 3: optional string crs; -} - -/** - * Embedded Variant logical type annotation - */ -struct VariantType { + 2: optional string crs; + 3: optional string crs_encoding; } /** From 9f53c9ebc2a00c9602bc62845057418899946ff1 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 13 Dec 2024 14:16:21 +0800 Subject: [PATCH 32/33] Add geography type --- Geospatial.md | 171 +++++++++++++++++++++++++++++++++ LogicalTypes.md | 45 +++++++++ src/main/thrift/parquet.thrift | 82 +++++++++++----- 3 files changed, 272 insertions(+), 26 deletions(-) create mode 100644 Geospatial.md diff --git a/Geospatial.md b/Geospatial.md new file mode 100644 index 000000000..3ecb821ad --- /dev/null +++ b/Geospatial.md @@ -0,0 +1,171 @@ + + +Geospatial Definitions +==== + +This document contains the specification of geospatial types and statistics. + +# Background + +The Geometry and Geography class hierarchy and its Well-Known Text (WKT) and +Well-Known Binary (WKB) serializations (ISO supporting XY, XYZ, XYM, XYZM) are +defined by [OpenGIS Implementation Specification for Geographic information – +Simple feature access – Part 1: Common architecture][sfa-part1], from [OGC +(Open Geospatial Consortium)][ogc]. + +The version of the OGC standard first used here is 1.2.1, but future versions +may also be used if the WKB representation remains wire-compatible. + +[sfa-part1]: https://portal.ogc.org/files/?artifact_id=25355 +[ogc]: https://www.ogc.org/standard/sfa/ + +## Well-Known Binary + +Well-Known Binary (WKB) representations of geometries. + +Apache Parquet follows the same definitions of GeoParquet for [WKB][geoparquet-wkb] +and [coordinate axis order][coordinate-axis-order]: +- Geometries should be encoded as ISO WKB supporting XY, XYZ, XYM, XYZM. Supported +standard geometry types: Point, LineString, Polygon, MultiPoint, MultiLineString, +MultiPolygon, and GeometryCollection. 
+- Coordinate axis order is always (x, y) where x is easting or longitude, and +y is northing or latitude. This ordering explicitly overrides the axis order +as specified in the CRS following the [GeoPackage specification][geopackage-spec]. + +[geoparquet-wkb]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 +[coordinate-axis-order]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 +[geopackage-spec]: https://www.geopackage.org/spec130/#gpb_spec + +## Coordinate Reference System + +Coordinate Reference System (CRS) is a mapping of how coordinates refer to +locations on Earth. + +Apache Parquet supports CRS customization by providing the following attributes: +* `crs`: a CRS text representation. If unset, the CRS defaults to "OGC:CRS84". +* `crs_encoding`: a standard encoding used to represent the CRS text. If unset, + `crs` can be an arbitrary string. + +For maximum interoperability of a custom CRS, it is recommended to provide +the CRS text with a standard encoding. Supported CRS encodings are: +* `SRID`: [Spatial reference identifier][srid], CRS text is the identifier itself. +* `PROJJSON`: [PROJJSON][projjson], CRS text is the PROJJSON string. + +For example, if a Geometry or Geography column uses the CRS "OGC:CRS84", a writer +may write a PROJJSON representation of [OGC:CRS84][ogc-crs84] to the `crs` field +and set the `crs_encoding` field to `PROJJSON`. + +[srid]: https://en.wikipedia.org/wiki/Spatial_reference_system#Identifier +[projjson]: https://proj.org/en/stable/specifications/projjson.html +[ogc-crs84]: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#ogccrs84-details + +## Edge Interpolation Algorithm + +The edge interpolation algorithm is used for interpreting edges of elements of +a Geography column. 
It applies to all non-point geometry objects and is +independent of the [Coordinate Reference System](#coordinate-reference-system). + +Supported values are: +* `spherical`: edges are interpolated as geodesics on a sphere. The radius of the underlying sphere is the mean radius of the spheroid defined by the CRS, defined as (2 * major_axis_length + minor_axis_length) / 3. +* `vincenty`: [https://en.wikipedia.org/wiki/Vincenty%27s_formulae](https://en.wikipedia.org/wiki/Vincenty%27s_formulae) +* `thomas`: Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970. +* `andoyer`: Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. +* `karney`: [Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55](https://link.springer.com/content/pdf/10.1007/s00190-012-0578-z.pdf), and [GeographicLib](https://geographiclib.sourceforge.io/) + +# Logical Types + +Apache Parquet supports the following geospatial logical type annotations: +* `GEOMETRY`: Geometry features in the WKB format with linear/planar edges interpolation. See [Geometry logical type](LogicalTypes.md#geometry) +* `GEOGRAPHY`: Geometry features in the WKB format with non-linear/non-planar edges interpolation. See [Geography logical type](LogicalTypes.md#geography) + +# Statistics + +`GeometryStatistics` is a struct specific for `GEOMETRY` and `GEOGRAPHY` logical +types to store statistics of a column chunk. It is an optional field in the +`ColumnMetaData` and contains [Bounding Box](#bounding-box) and [Geometry +Types](#geometry-types). + +## Bounding Box + +A geometry has at least two coordinate dimensions: X and Y for 2D coordinates +of each point. A geometry can optionally have Z and / or M values associated +with each point in the geometry. + +The Z values introduce the third dimension coordinate. Usually they are used to +indicate the height, or elevation. 
+ +M values are an opportunity for a geometry to express a fourth dimension as a +coordinate value. These values can be used as a linear reference value (e.g., +highway milepost value), a timestamp, or some other value as defined by the CRS. + +Bounding box is defined as the thrift struct below in the representation of +min/max value pair of coordinates from each axis. Note that X and Y Values are +always present. Z and M are omitted for 2D geometries. The concepts of westmost +and eastmost values are explicitly introduced for Geography logical type to +address cases involving antimeridian crossing, where xmin may be greater than +xmax. + +```thrift +struct BoundingBox { + /** Min X value for Geometry logical type, westmost value for Geography logical type */ + 1: required double xmin; + /** Max X value for Geometry logical type, eastmost value for Geography logical type */ + 2: required double xmax; + /** Min Y value for Geometry logical type, southmost value for Geography logical type */ + 3: required double ymin; + /** Max Y value for Geometry logical type, northmost value for Geography logical type */ + 4: required double ymax; + /** Min Z value if the axis exists */ + 5: optional double zmin; + /** Max Z value if the axis exists */ + 6: optional double zmax; + /** Min M value if the axis exists */ + 7: optional double mmin; + /** Max M value if the axis exists */ + 8: optional double mmax; +} +``` + +## Geometry Types + +A list of geometry types from all geometries in the `GEOMETRY` or `GEOGRAPHY` +column, or an empty list if they are not known. + +This is borrowed from [geometry_types of GeoParquet][geometry-types] except that +values in the list are [WKB (ISO-variant) integer codes][wkb-integer-code]. 
+Table below shows the most common geometry types and their codes: + +| Type | XY | XYZ | XYM | XYZM | +| :----------------- | :--- | :--- | :--- | :--: | +| Point | 0001 | 1001 | 2001 | 3001 | +| LineString | 0002 | 1002 | 2002 | 3002 | +| Polygon | 0003 | 1003 | 2003 | 3003 | +| MultiPoint | 0004 | 1004 | 2004 | 3004 | +| MultiLineString | 0005 | 1005 | 2005 | 3005 | +| MultiPolygon | 0006 | 1006 | 2006 | 3006 | +| GeometryCollection | 0007 | 1007 | 2007 | 3007 | + +In addition, the following rules are applied: +- A list of multiple values indicates that multiple geometry types are present (e.g. `[0003, 0006]`). +- An empty array explicitly signals that the geometry types are not known. +- The geometry types in the list must be unique (e.g. `[0001, 0001]` is not valid). + +[geometry-types]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 +[wkb-integer-code]: https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary diff --git a/LogicalTypes.md b/LogicalTypes.md index 287e0fc74..c7a53d516 100644 --- a/LogicalTypes.md +++ b/LogicalTypes.md @@ -599,6 +599,51 @@ optional group variant_shredded (VARIANT) { } ``` +### GEOMETRY + +`GEOMETRY` is used for geometry features in the Well-Known Binary (WKB) format +with linear/planar edges interpolation. See [Geospatial.md](Geospatial.md) for +more detail. + +The type has two type parameters: +- `crs`: An optional string value for Coordinate Reference System (CRS), which + is a mapping of how coordinates refer to locations on Earth. If unset, the CRS + defaults to "OGC:CRS84", which means that the geometries must be stored in + longitude, latitude based on the WGS84 datum. +- `crs_encoding`: An optional enum value to describe the encoding used by the + `crs` field. Supported values are: `SRID`, `PROJJSON`. If unset, `crs` can be + an arbitrary string. + +The sort order used for `GEOMETRY` is undefined. 
When writing data, no min/max +statistics should be saved for this type and if such non-compliant statistics +are found during reading, they must be ignored. + +[`GeometryStatistics`](Geospatial.md#statistics) is introduced to store statistics +for `GEOMETRY` type. + +### GEOGRAPHY + +`GEOGRAPHY` is used for geography features in the WKB format with non-linear/non-planar +edges interpolation. + +The type has three type parameters: +- `crs`: An optional string value for CRS, similar to `GEOMETRY` type. It must + be a geographic CRS, where longitudes are bound by [-180, 180] and latitudes + are bound by [-90, 90]. +- `crs_encoding`: An optional enum value, similar to `GEOMETRY` type. +- `algorithm`: A required enum value to describe the edge interpolation + algorithm. Supported values are: `SPHERICAL`, `VINCENTY`, `THOMAS`, `ANDOYER`, + `KARNEY`. In order to correctly interpret edges interpolation of the geometries, + writer implementations should always set it and reader implementations should + fail for unknown values. + +The sort order used for `GEOGRAPHY` is undefined. When writing data, no min/max +statistics should be saved for this type and if such non-compliant statistics +are found during reading, they must be ignored. + +[`GeometryStatistics`](Geospatial.md#statistics) is introduced to store statistics +for `GEOGRAPHY` type. + ## Nested Types This section specifies how `LIST` and `MAP` can be used to encode nested types diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 5d5e49094..8cc80fb86 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -242,13 +242,13 @@ struct SizeStatistics { * coordinates from each axis. 
*/ struct BoundingBox { - /** Westmost value (min longitude) on the X axis */ + /** Min X value for Geometry logical type, westmost value for Geography logical type */ 1: required double xmin; - /** Eastmost value (max longitude) on the X axis */ + /** Max X value for Geometry logical type, eastmost value for Geography logical type */ 2: required double xmax; - /** Southmost value (min latitude) on the Y axis */ + /** Min Y value for Geometry logical type, southmost value for Geography logical type */ 3: required double ymin; - /** Northmost value (max latitude) on the Y axis */ + /** Max Y value for Geometry logical type, northmost value for Geography logical type */ 4: required double ymax; /** Min Z value if the axis exists */ 5: optional double zmin; @@ -260,7 +260,7 @@ struct BoundingBox { 8: optional double mmax; } -/** Statistics specific to GEOMETRY logical type */ +/** Statistics specific to Geometry and Geography logical types */ struct GeometryStatistics { /** A bounding box of geometries */ 1: optional BoundingBox bbox; @@ -417,33 +417,62 @@ struct BsonType { struct VariantType { } -/** Physical type and encoding for the geometry type */ -enum GeometryEncoding { - /** - * Allowed for physical type: BYTE_ARRAY. - * - * Well-known binary (WKB) representations of geometries. - */ - WKB = 0; +/** Coordinate reference system (CRS) encoding for Geometry and Geography logical types */ +enum CRSEncoding { + SRID = 0; + PROJJSON = 1; +} + +/** Edge interpolation algorithm for Geography logical type */ +enum EdgeInterpolationAlgorithm { + SPHERICAL = 0; + VINCENTY = 1; + THOMAS = 2; + ANDOYER = 3; + KARNEY = 4; } /** - * GEOMETRY logical type annotation (added in 2.11.0) + * Embedded Geometry logical type annotation + * + * Geometry features in the Well-Known Binary (WKB) format with linear/planar + * edges interpolation. * - * GeometryEncoding is required. 
In order to correctly interpret geometry data, - * writer implementations SHOULD always set it, and reader implementations - * SHOULD fail for unknown values. + * A custom CRS can be set to the crs field. If unset, the CRS defaults to + * "OGC:CRS84", which means that the geometries must be stored in longitude, + * latitude based on the WGS84 datum. * - * CRS is optional. A custom CRS and its corresponding encoding can be set to - * crs and crs_encoding fields respectively. If missing, the CRS defaults to - * "OGC:CRS84". + * crs_encoding is an auxiliary field to help decode the crs text. If unset, the + * crs field can be arbitrary text. * - * See LogicalTypes.md for detail. + * Allowed for physical type: BYTE_ARRAY. 
+ */ +struct GeographyType { + 1: optional string crs; + 2: optional CRSEncoding crs_encoding; + 3: required EdgeInterpolationAlgorithm algorithm; } /** @@ -919,8 +948,8 @@ struct ColumnMetaData { */ 16: optional SizeStatistics size_statistics; - /** Optional statistics specific to GEOMETRY logical type */ - 17: optional GeometryStatistics geometry_stats; + /** Optional statistics specific for Geometry and Geography logical types */ + 17: optional GeometryStatistics geometry_statistics; } struct EncryptionWithFooterKey { @@ -1053,6 +1082,7 @@ union ColumnOrder { * MAP - undefined * VARIANT - undefined * GEOMETRY - undefined + * GEOGRAPHY - undefined * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true From a4f79ca8a9aae8c44b5e1d786718b7a428fed540 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 13 Dec 2024 14:21:31 +0800 Subject: [PATCH 33/33] remove wrong content --- LogicalTypes.md | 147 ------------------------------------------------ 1 file changed, 147 deletions(-) diff --git a/LogicalTypes.md b/LogicalTypes.md index c7a53d516..5c3ddcf42 100644 --- a/LogicalTypes.md +++ b/LogicalTypes.md @@ -848,153 +848,6 @@ optional group my_map (MAP_KEY_VALUE) { } ``` -## Geospatial Types - -### GEOMETRY - -`GEOMETRY` is used for geometry features from [OGC – Simple feature access][simple-feature-access]. -See [Geospatial Notes](#geospatial-notes). - -The type has three type parameters: -- `encoding`: A required enum value for annonated physical type and encoding for - the `GEOMETRY` type. See [Geometry Encoding](#geometry-encoding). -- `crs`: An optional string value for CRS (coordinate reference system), which - is a mapping of how coordinates refer to precise locations on earth. See - [Coordinate Reference System](#coordinate-reference-system). -- `crs_encoding`: An optional string value to describes the encoding used by the - `crs` field. See [Coordinate Reference System](#coordinate-reference-system). 
- -The sort order used for `GEOMETRY` is undefined. When writing data, no min/max -statistics should be saved for this type and if such non-compliant statistics -are found during reading, they must be ignored. [GeometryStatistics](#geometry-statistics) -is introduced to store geometry statistics for `GEOMETRY` type. - -#### Geometry Encoding - -Physical type and encoding for the `GEOMETRY` type. Supported values: -- `WKB`: `GEOMETRY` type with `WKB` encoding can only be used to annotate the - `BYTE_ARRAY` primitive type. See [WKB](#well-known-binary-wkb). - -Note that geometry encoding is required for `GEOMETRY` type. In order to correctly -interpret geometry data, writer implementations SHOULD always set this field, and -reader implementations SHOULD fail for an unknown geometry encoding value. - -##### Well-known binary (WKB) - -Well-known binary (WKB) representations of geometries, see [Geospatial Notes](#geospatial-notes). - -To be clear, we follow the same definitions of GeoParquet for [WKB][geoparquet-wkb] -and [coordinate axis order][coordinate-axis-order]: -- Geometries SHOULD be encoded as ISO WKB supporting XY, XYZ, XYM, XYZM. Supported -standard geometry types: Point, LineString, Polygon, MultiPoint, MultiLineString, -MultiPolygon, and GeometryCollection. -- Coordinate axis order is always (x, y) where x is easting or longitude, and -y is northing or latitude. This ordering explicitly overrides the axis order -as specified in the CRS following the [GeoPackage specification][geopackage-spec]. - -This is the preferred encoding for maximum portability. 
- -[geoparquet-wkb]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 -[coordinate-axis-order]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 -[geopackage-spec]: https://www.geopackage.org/spec130/#gpb_spec - -#### Coordinate Reference System - -CRS (coordinate reference system) is a mapping of how coordinates refer to -locations on earth. A custom CRS is specified by a string value in the `crs` -field of the `GEOMETRY` type. An additional `crs_encoding` field describes the -encoding used for this CRS representation. Both fields are optional. If custom -CRS is not provided, CRS defaults to "OGC:CRS84". - -For example, if a geometry column uses the CRS "OGC:CRS84", the writer may -write a PROJJSON representation of [OGC:CRS84][ogc-crs84] to the `crs` field -and set the `crs_encoding` field to "PROJJSON". - -[ogc-crs84]: https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#ogccrs84-details - -#### Geometry Statistics - -`GeometryStatistics` is a struct to store geometry statistics of a column chunk -of `GEOMETRY` type. It is an optional field of `ColumnMetaData` and contains -[Bounding Box](#bounding-box) and [Geometry Types](#geometry-types). - -##### Bounding Box - -A geometry has at least two coordinate dimensions: X and Y for 2D coordinates -of each point. A geometry can optionally have Z and / or M values associated -with each point in the geometry. - -The Z values introduce the third dimension coordinate. Usually they are used -to indicate the height, or elevation. - -M values are an opportunity for a geometry to express a fourth dimension as -a coordinate value. These values can be used as a linear reference value -(e.g., highway milepost value), a timestamp, or some other value as defined -by the CRS. - -Bounding box is defined as the thrift struct below in the representation of -min/max value pair of coordinates from each axis. 
Note that X and Y Values -are always present. Z and M are omitted for 2D geometries. - -```thrift -struct BoundingBox { - /** Westmost value (min longitude) on the X axis */ - 1: required double xmin; - /** Eastmost value (max longitude) on the X axis */ - 2: required double xmax; - /** Southmost value (min latitude) on the Y axis */ - 3: required double ymin; - /** Northmost value (max latitude) on the Y axis */ - 4: required double ymax; - /** Min Z value if the axis exists */ - 5: optional double zmin; - /** Max Z value if the axis exists */ - 6: optional double zmax; - /** Min M value if the axis exists */ - 7: optional double mmin; - /** Max M value if the axis exists */ - 8: optional double mmax; -} -``` - -##### Geometry Types - -A list of geometry types from all geometries in the `GEOMETRY` column, or an -empty list if they are not known. - -This is borrowed from [geometry_types of GeoParquet][geometry-types] -except that values in the list are [WKB (ISO-variant) integer codes][wkb-integer-code]. -Table below shows the most common geometry types and their codes: - -| Type | XY | XYZ | XYM | XYZM | -| :----------------- | :--- | :--- | :--- | :--: | -| Point | 0001 | 1001 | 2001 | 3001 | -| LineString | 0002 | 1002 | 2002 | 3002 | -| Polygon | 0003 | 1003 | 2003 | 3003 | -| MultiPoint | 0004 | 1004 | 2004 | 3004 | -| MultiLineString | 0005 | 1005 | 2005 | 3005 | -| MultiPolygon | 0006 | 1006 | 2006 | 3006 | -| GeometryCollection | 0007 | 1007 | 2007 | 3007 | - -In addition, the following rules are applied: -- A list of multiple values indicates that multiple geometry types are present (e.g. `[0003, 0006]`). -- An empty array explicitly signals that the geometry types are not known. -- The geometry types in the list must be unique (e.g. `[0001, 0001]` is not valid). 
- -[geometry-types]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 -[wkb-integer-code]: https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary - -#### Geospatial Notes - -The Geometry class hierarchy and its WKT and WKB serializations (ISO supporting -XY, XYZ, XYM, XYZM) are defined by [OpenGIS Implementation Specification for -Geographic information – Simple feature access – Part 1: Common architecture]( -https://portal.ogc.org/files/?artifact_id=25355), from [OGC (Open Geospatial -Consortium)](https://www.ogc.org/standard/sfa/). - -The version of the OGC standard first used here is 1.2.1, but future versions -may also used if the WKB representation remains wire-compatible. - ## UNKNOWN (always null) Sometimes, when discovering the schema of existing data, values are always null