diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 27d404374..bd9fabdbd 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -270,8 +270,11 @@ struct Statistics { * may set min_value="B", max_value="C". Such more compact values must still be * valid values within the column's logical type. * - * Values are encoded using PLAIN encoding, except that variable-length byte - * arrays do not include a length prefix. + * Values are encoded using PLAIN encoding, except that: + * 1) variable-length byte arrays do not include a length prefix. + * 2) geometry logical type with BoundingBoxOrder uses max_value/min_value pair + * to store the bounding box for the column. Please refer to the definition + * of BoundingBoxOrder for detail. */ 5: optional binary max_value; 6: optional binary min_value; @@ -373,6 +376,69 @@ struct JsonType { struct BsonType { } +/** + * A geometry can be any of the following subtypes. + * The list of geospatial subtypes is taken from the OGC (Open Geospatial Consortium) + * SFA (Simple Feature Access) Part 1- Common Architecture. + */ +enum GeometrySubType { + POINT = 0; + LINESTRING = 1; + POLYGON = 2; + MULTIPOINT = 3; + MULTILINESTRING = 4; + MULTIPOLYGON = 5; + GEOMETRY_COLLECTION = 6; +} + +/** + * Interpretation for edges, i.e. whether the edge between points + * represent a straight cartesian line or the shortest line on the sphere + */ +enum Edges { + PLANAR = 0; + // SPHERICAL = 1; // not supported yet +} + +/** + * Well-Known Binary. This is a well-known and popular binary representation regulated + * by the Open Geospatial Consortium (OGC). + */ +struct WKB {} +/** + * Encoding for geospatial data. + */ +union GeospatialEncoding { + 1: WKB WKB +} + +/** + * Geometry logical type annotation + * + * Allowed for physical types: BINARY (added in 2.11.0) + */ +struct GeometryType { + /** + * The subtype of the geometry. + * If set, all values in the column must be of the same subtype. + * If not set, the column may contain values of any subtype. + */ + 1: optional GeometrySubType subtype; + /** + * The dimension of the geometry. + * For now only 2D geometry is supported and the value must be 2 if set. + */ + 2: optional int dimension; + /** + * Coordinate Reference System, i.e. mapping of how coordinates refer to + * precise locations on earth. + * For now only OGC:CRS84 is supported. + */ + 3: optional string crs; + 4: required Edges edges; + 5: required GeospatialEncoding encoding; +} + /** * LogicalType annotations to replace ConvertedType. * @@ -403,6 +469,7 @@ union LogicalType { 13: BsonType BSON // use ConvertedType BSON 14: UUIDType UUID // no compatible ConvertedType 15: Float16Type FLOAT16 // no compatible ConvertedType + 16: GeometryType GEOMETRY // no compatible ConvertedType } /** @@ -916,6 +983,8 @@ struct RowGroup { /** Empty struct to signal the order defined by the physical or logical type */ struct TypeDefinedOrder {} +/** Empty struct to signal the order of GEOMETRY logical type */ +struct BoundingBoxOrder {} /** * Union to specify the order used for the min_value and max_value fields for a @@ -925,6 +994,8 @@ struct TypeDefinedOrder {} * Possible values are: * * TypeDefinedOrder - the column uses the order defined by its logical or * physical type (if there is no logical type). + * * BoundingBoxOrder - the column uses the order to build bounding box + * (if the logical type is GEOMETRY). * * If the reader does not support the value of this union, min and max stats * for this column should be ignored. @@ -954,6 +1025,7 @@ union ColumnOrder { * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined + * GEOMETRY - undefined, as geometry objects cannot be compared directly * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true @@ -982,6 +1054,23 @@ union ColumnOrder { * `-0.0` should be written into the min statistics field. */ 1: TypeDefinedOrder TYPE_ORDER; + + /** + * The order only applies to GEOMETRY logical type. + * + * Please note that geometry objects cannot be compared directly. This order aims to + * provide an approach to build a bounding box for geometry objects in the same page + * or column chunk. + * + * In this order, all 2D geometries are regarded as a collection of coordinate (x, y). + * For example, POINT has one coordinate, LINESTRING has two coordinates, and POLYGON + * might have three or more coordinates. A bounding box is the combination of x_min, + * x_max, y_min, and y_max of all coordinates from all geometry values. For simplexty, + * min_value field in the Statistics/ColumnIndex is encoded as the concatenation of + * PLAIN-encoded DOUBLE-typed x_min and y_min values. Similarly, max_value field is + * encoded as the concatenation of PLAIN-encoded DOUBLE-typed x_max and y_max values. + */ + 2: BoundingBoxOrder BBOX_ORDER; } struct PageLocation { @@ -1039,6 +1128,9 @@ struct ColumnIndex { * Such more compact values must still be valid values within the column's * logical type. Readers must make sure that list entries are populated before * using them by inspecting null_pages. + * + * For GEOMETRY logical type, these values are the bounding box of the column. + * Please refer to the definition of BoundingBoxOrder for detail. */ 2: required list min_values 3: required list max_values @@ -1048,6 +1140,8 @@ struct ColumnIndex { * which direction. This allows readers to perform binary searches in both * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even * if the lists are ordered. + * + * For GEOMETRY type, UNORDERED is used at all times. */ 4: required BoundaryOrder boundary_order