Skip to content

Commit

Permalink
WIP: Add geometry logical type
Browse files Browse the repository at this point in the history
  • Loading branch information
wgtmac committed May 10, 2024
1 parent 4cbb4cf commit 4d36df9
Showing 1 changed file with 96 additions and 2 deletions.
98 changes: 96 additions & 2 deletions src/main/thrift/parquet.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,11 @@ struct Statistics {
* may set min_value="B", max_value="C". Such more compact values must still be
* valid values within the column's logical type.
*
* Values are encoded using PLAIN encoding, except that variable-length byte
* arrays do not include a length prefix.
* Values are encoded using PLAIN encoding, except that:
* 1) variable-length byte arrays do not include a length prefix.
* 2) a column of the geometry logical type with BoundingBoxOrder uses the
* max_value/min_value pair to store the bounding box of its values. Please
* refer to the definition of BoundingBoxOrder for details.
*/
5: optional binary max_value;
6: optional binary min_value;
Expand Down Expand Up @@ -373,6 +376,69 @@ struct JsonType {
struct BsonType {
}

/**
* Subtype of a geometry value.
*
* The list of geospatial subtypes is taken from the OGC (Open Geospatial
* Consortium) SFA (Simple Feature Access) Part 1: Common Architecture.
*
* NOTE(review): the numeric values here start at 0, whereas the WKB geometry
* type codes for the same subtypes start at 1 (Point = 1 ... GeometryCollection
* = 7). Confirm that the offset is intentional, since the two must not be
* compared directly.
*/
enum GeometrySubType {
POINT = 0;
LINESTRING = 1;
POLYGON = 2;
MULTIPOINT = 3;
MULTILINESTRING = 4;
MULTIPOLYGON = 5;
GEOMETRY_COLLECTION = 6;
}

/**
* Interpretation of edges, i.e. whether the edge between two points
* represents a straight cartesian line or the shortest line on the sphere.
*/
enum Edges {
// Straight cartesian line between points; the only interpretation
// available so far.
PLANAR = 0;
// SPHERICAL = 1; // not supported yet
}

/**
* Well-Known Binary (WKB): a well-known and popular binary representation of
* geometry objects, regulated by the Open Geospatial Consortium (OGC).
*
* Empty marker struct; it is selected through the GeospatialEncoding union.
*/
struct WKB {}
/**
* Encoding for geospatial data.
*
* WKB is the only encoding defined so far.
*
* NOTE(review): the field below has the same name as its type (WKB). Some
* code generators (e.g. for C++) may reject a member whose name shadows its
* type -- confirm that every targeted Thrift generator accepts this.
*/
union GeospatialEncoding {
1: WKB WKB
}

/**
 * Geometry logical type annotation
 *
 * Allowed for physical types: BINARY (added in 2.11.0)
 */
struct GeometryType {
  /**
   * The subtype of the geometry.
   * If set, all values in the column must be of the same subtype.
   * If not set, the column may contain values of any subtype.
   */
  1: optional GeometrySubType subtype;
  /**
   * The dimension of the geometry.
   * For now only 2D geometry is supported and the value must be 2 if set.
   *
   * Declared as i32: Thrift has no `int` base type.
   */
  2: optional i32 dimension;
  /**
   * Coordinate Reference System, i.e. mapping of how coordinates refer to
   * precise locations on earth.
   * For now only OGC:CRS84 is supported.
   */
  3: optional string crs;
  /**
   * Interpretation of the edges between points of the geometry values.
   * See the Edges enum for the allowed values.
   */
  4: required Edges edges;
  /**
   * Encoding used to serialize the geometry values of the column.
   * See the GeospatialEncoding union for the available encodings.
   */
  5: required GeospatialEncoding encoding;
}

/**
* LogicalType annotations to replace ConvertedType.
*
Expand Down Expand Up @@ -403,6 +469,7 @@ union LogicalType {
13: BsonType BSON // use ConvertedType BSON
14: UUIDType UUID // no compatible ConvertedType
15: Float16Type FLOAT16 // no compatible ConvertedType
16: GeometryType GEOMETRY // no compatible ConvertedType
}

/**
Expand Down Expand Up @@ -916,6 +983,8 @@ struct RowGroup {

/**
* Empty struct to signal the order defined by the physical or logical type.
* Used as the TYPE_ORDER member of the ColumnOrder union.
*/
struct TypeDefinedOrder {}
/**
* Empty struct to signal the order of the GEOMETRY logical type: min_value and
* max_value hold a bounding box of the geometry values instead of ordered
* values. Used as the BBOX_ORDER member of the ColumnOrder union.
*/
struct BoundingBoxOrder {}

/**
* Union to specify the order used for the min_value and max_value fields for a
Expand All @@ -925,6 +994,8 @@ struct TypeDefinedOrder {}
* Possible values are:
* * TypeDefinedOrder - the column uses the order defined by its logical or
* physical type (if there is no logical type).
* * BoundingBoxOrder - min_value and max_value store the bounding box of the
* column's geometry values (only valid if the logical type is GEOMETRY).
*
* If the reader does not support the value of this union, min and max stats
* for this column should be ignored.
Expand Down Expand Up @@ -954,6 +1025,7 @@ union ColumnOrder {
* ENUM - unsigned byte-wise comparison
* LIST - undefined
* MAP - undefined
* GEOMETRY - undefined, as geometry objects cannot be compared directly
*
* In the absence of logical types, the sort order is determined by the physical type:
* BOOLEAN - false, true
Expand Down Expand Up @@ -982,6 +1054,23 @@ union ColumnOrder {
* `-0.0` should be written into the min statistics field.
*/
1: TypeDefinedOrder TYPE_ORDER;

/**
* The order only applies to GEOMETRY logical type.
*
* Please note that geometry objects cannot be compared directly. This order aims to
* provide an approach to build a bounding box for geometry objects in the same page
* or column chunk.
*
* In this order, all 2D geometries are regarded as a collection of coordinates (x, y).
* For example, a POINT has one coordinate, a LINESTRING has two or more coordinates,
* and a POLYGON might have three or more coordinates. A bounding box is the combination
* of x_min, x_max, y_min, and y_max of all coordinates from all geometry values. For
* simplicity, the min_value field in the Statistics/ColumnIndex is encoded as the
* concatenation of PLAIN-encoded DOUBLE-typed x_min and y_min values. Similarly, the
* max_value field is encoded as the concatenation of PLAIN-encoded DOUBLE-typed x_max
* and y_max values.
*/
2: BoundingBoxOrder BBOX_ORDER;
}

struct PageLocation {
Expand Down Expand Up @@ -1039,6 +1128,9 @@ struct ColumnIndex {
* Such more compact values must still be valid values within the column's
* logical type. Readers must make sure that list entries are populated before
* using them by inspecting null_pages.
*
* For the GEOMETRY logical type, these values encode the bounding box of each
* page. Please refer to the definition of BoundingBoxOrder for details.
*/
2: required list<binary> min_values
3: required list<binary> max_values
Expand All @@ -1048,6 +1140,8 @@ struct ColumnIndex {
* which direction. This allows readers to perform binary searches in both
* lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
* if the lists are ordered.
*
* For the GEOMETRY logical type, boundary_order is always UNORDERED.
*/
4: required BoundaryOrder boundary_order

Expand Down

0 comments on commit 4d36df9

Please sign in to comment.