diff --git a/docs/en/connector-v2/Config-Encryption-Decryption.md b/docs/en/connector-v2/Config-Encryption-Decryption.md index edb8061b46a..7574c53919c 100644 --- a/docs/en/connector-v2/Config-Encryption-Decryption.md +++ b/docs/en/connector-v2/Config-Encryption-Decryption.md @@ -183,3 +183,42 @@ If you want to customize the encryption method and the configuration of the encr 5. Package it to jar and add jar to `${SEATUNNEL_HOME}/lib` 6. Change the option `shade.identifier` to the value that you defined in `ConfigShade#getIdentifier`of you config file, please enjoy it \^_\^ +### How to encrypt and decrypt with customized params + +If you want to encrypt and decrypt with customized params, you can follow the steps below: +1. Add a configuration named `shade.properties` in the env part of the configuration file, the value of this configuration is in the form of key-value pairs (the type of the key must be a string), as shown below: + + ```hocon + env { + shade.properties = { + suffix = "666" + } + } + + ``` + +2. Override the `ConfigShade` interface's `open` method, as shown below: + + ```java + public static class ConfigShadeWithProps implements ConfigShade { + + private String suffix; + private String identifier = "withProps"; + + @Override + public void open(Map props) { + this.suffix = String.valueOf(props.get("suffix")); + } + } + ``` +3. Use the parameters passed in the open method in the encryption and decryption methods, as shown below: + + ```java + public String encrypt(String content) { + return content + suffix; + } + + public String decrypt(String content) { + return content.substring(0, content.length() - suffix.length()); + } + ``` \ No newline at end of file diff --git a/docs/en/connector-v2/sink/CosFile.md b/docs/en/connector-v2/sink/CosFile.md index db11cfb9af8..2441306566a 100644 --- a/docs/en/connector-v2/sink/CosFile.md +++ b/docs/en/connector-v2/sink/CosFile.md @@ -63,6 +63,7 @@ By default, we use 2PC commit to ensure `exactly-once` | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | diff --git a/docs/en/connector-v2/sink/Doris.md b/docs/en/connector-v2/sink/Doris.md index dc177a3962b..a7e2fabfc65 100644 --- a/docs/en/connector-v2/sink/Doris.md +++ b/docs/en/connector-v2/sink/Doris.md @@ -47,7 +47,7 @@ The internal implementation of Doris sink connector is cached and imported by st | table | String | Yes | - | The table name of `Doris` table, use `${table_name}` to represent the upstream table name | | table.identifier | String | Yes | - | The name of `Doris` table, it will deprecate after version 2.3.5, please use `database` and `table` instead. | | sink.label-prefix | String | Yes | - | The label prefix used by stream load imports. 
In the 2pc scenario, global uniqueness is required to ensure the EOS semantics of SeaTunnel. | -| sink.enable-2pc | bool | No | false | Whether to enable two-phase commit (2pc), the default is false. For two-phase commit, please refer to [here](https://doris.apache.org/docs/dev/sql-manual/sql-statements/Data-Manipulation-Statements/Load/STREAM-LOAD/). | +| sink.enable-2pc | bool | No | false | Whether to enable two-phase commit (2pc), the default is false. For two-phase commit, please refer to [here](https://doris.apache.org/docs/data-operate/transaction?_highlight=two&_highlight=phase#stream-load-2pc). | | sink.enable-delete | bool | No | - | Whether to enable deletion. This option requires Doris table to enable batch delete function (0.15+ version is enabled by default), and only supports Unique model. you can get more detail at this [link](https://doris.apache.org/docs/dev/data-operate/delete/batch-delete-manual/) | | sink.check-interval | int | No | 10000 | check exception with the interval while loading | | sink.max-retries | int | No | 3 | the max retry times if writing records to database failed | diff --git a/docs/en/connector-v2/sink/FtpFile.md b/docs/en/connector-v2/sink/FtpFile.md index 175d374d9aa..47811bdd791 100644 --- a/docs/en/connector-v2/sink/FtpFile.md +++ b/docs/en/connector-v2/sink/FtpFile.md @@ -62,6 +62,7 @@ By default, we use 2PC commit to ensure `exactly-once` | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | diff --git a/docs/en/connector-v2/sink/HdfsFile.md b/docs/en/connector-v2/sink/HdfsFile.md index 095c32eabc1..ae9479aa8fa 100644 --- a/docs/en/connector-v2/sink/HdfsFile.md +++ b/docs/en/connector-v2/sink/HdfsFile.md @@ -69,6 +69,7 @@ Output data to hdfs file | xml_row_tag | string | no | RECORD | Only used when file_format is xml, specifies the tag name of the data rows within the XML file | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml, specifies Whether to process data using the tag attribute format. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | diff --git a/docs/en/connector-v2/sink/LocalFile.md b/docs/en/connector-v2/sink/LocalFile.md index c48394f9175..9c2141b61f6 100644 --- a/docs/en/connector-v2/sink/LocalFile.md +++ b/docs/en/connector-v2/sink/LocalFile.md @@ -58,6 +58,7 @@ By default, we use 2PC commit to ensure `exactly-once` | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | diff --git a/docs/en/connector-v2/sink/ObsFile.md b/docs/en/connector-v2/sink/ObsFile.md index 560e7bfb35e..aa852c9b702 100644 --- a/docs/en/connector-v2/sink/ObsFile.md +++ b/docs/en/connector-v2/sink/ObsFile.md @@ -71,6 +71,7 @@ It only supports hadoop version **2.9.X+**. | is_enable_transaction | boolean | no | true | [Tips](#is_enable_transaction) | | batch_size | int | no | 1000000 | [Tips](#batch_size) | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | compress_codec | string | no | none | [Tips](#compress_codec) | | common-options | object | no | - | [Tips](#common_options) | | max_rows_in_memory | int | no | - | When File Format is Excel,The maximum number of data items that can be cached in the memory.Only used when file_format is excel. | diff --git a/docs/en/connector-v2/sink/OssFile.md b/docs/en/connector-v2/sink/OssFile.md index 52da0e83f56..55ef4f0935e 100644 --- a/docs/en/connector-v2/sink/OssFile.md +++ b/docs/en/connector-v2/sink/OssFile.md @@ -115,6 +115,7 @@ If write to `csv`, `text` file type, All column will be string. | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | diff --git a/docs/en/connector-v2/sink/OssJindoFile.md b/docs/en/connector-v2/sink/OssJindoFile.md index 1a95e81a44f..21fe05359e5 100644 --- a/docs/en/connector-v2/sink/OssJindoFile.md +++ b/docs/en/connector-v2/sink/OssJindoFile.md @@ -67,6 +67,7 @@ By default, we use 2PC commit to ensure `exactly-once` | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | diff --git a/docs/en/connector-v2/sink/S3File.md b/docs/en/connector-v2/sink/S3File.md index b5fb34e0311..b6fbc4ef4e4 100644 --- a/docs/en/connector-v2/sink/S3File.md +++ b/docs/en/connector-v2/sink/S3File.md @@ -123,6 +123,7 @@ If write to `csv`, `text` file type, All column will be string. | xml_row_tag | string | no | RECORD | Only used when file_format is xml, specifies the tag name of the data rows within the XML file | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml, specifies Whether to process data using the tag attribute format. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | hadoop_s3_properties | map | no | | If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | diff --git a/docs/en/connector-v2/sink/SftpFile.md b/docs/en/connector-v2/sink/SftpFile.md index dbc8438ae26..4cde1eb866b 100644 --- a/docs/en/connector-v2/sink/SftpFile.md +++ b/docs/en/connector-v2/sink/SftpFile.md @@ -61,6 +61,7 @@ By default, we use 2PC commit to ensure `exactly-once` | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | +| create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | diff --git a/docs/en/connector-v2/source/Iceberg.md b/docs/en/connector-v2/source/Iceberg.md index 8bb21eb7b63..877be6f4d48 100644 --- a/docs/en/connector-v2/source/Iceberg.md +++ b/docs/en/connector-v2/source/Iceberg.md @@ -71,11 +71,12 @@ libfb303-xxx.jar ## Source Options -| Name | Type | Required | Default | Description | +| Name | Type | Required | Default | Description | |--------------------------|---------|----------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | catalog_name | string | yes | - | User-specified catalog name. | | namespace | string | yes | - | The iceberg database name in the backend catalog. | -| table | string | yes | - | The iceberg table name in the backend catalog. | +| table | string | no | - | The iceberg table name in the backend catalog. | +| table_list | string | no | - | The iceberg table list in the backend catalog. | | iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog, which can be referenced in this file:"https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java" | | hadoop.config | map | no | - | Properties passed through to the Hadoop configuration | | iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. | @@ -87,6 +88,7 @@ libfb303-xxx.jar | use_snapshot_id | long | no | - | Instructs this scan to look for use the given snapshot ID. | | use_snapshot_timestamp | long | no | - | Instructs this scan to look for use the most recent snapshot as of the given time in milliseconds. timestamp – the timestamp in millis since the Unix epoch | | stream_scan_strategy | enum | no | FROM_LATEST_SNAPSHOT | Starting strategy for stream mode execution, Default to use `FROM_LATEST_SNAPSHOT` if don’t specify any value,The optional values are:
TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan then switch to the incremental mode.
FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot inclusive.
FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot inclusive.
FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific id inclusive.
FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp inclusive. | +| increment.scan-interval | long | no | 2000 | The interval of increment scan(mills) | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. | ## Task Example @@ -101,25 +103,6 @@ env { source { Iceberg { - schema { - fields { - f2 = "boolean" - f1 = "bigint" - f3 = "int" - f4 = "bigint" - f5 = "float" - f6 = "double" - f7 = "date" - f9 = "timestamp" - f10 = "timestamp" - f11 = "string" - f12 = "bytes" - f13 = "bytes" - f14 = "decimal(19,9)" - f15 = "array" - f16 = "map" - } - } catalog_name = "seatunnel" iceberg.catalog.config={ type = "hadoop" @@ -141,6 +124,31 @@ sink { } ``` +### Multi-Table Read: + +```hocon +source { + Iceberg { + catalog_name = "seatunnel" + iceberg.catalog.config = { + type = "hadoop" + warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" + } + namespace = "database1" + table_list = [ + { + table = "table_1 + }, + { + table = "table_2 + } + ] + + plugin_output = "iceberg" + } +} +``` + ### Hadoop S3 Catalog: ```hocon diff --git a/docs/en/transform-v2/llm.md b/docs/en/transform-v2/llm.md index c1c9798abe3..680121cb4da 100644 --- a/docs/en/transform-v2/llm.md +++ b/docs/en/transform-v2/llm.md @@ -28,7 +28,7 @@ more. ### model_provider The model provider to use. The available options are: -OPENAI, DOUBAO, KIMIAI, MICROSOFT, CUSTOM +OPENAI, DOUBAO, DEEPSEEK, KIMIAI, MICROSOFT, CUSTOM > tips: If you use Microsoft, please make sure api_path cannot be empty diff --git a/docs/zh/connector-v2/Config-Encryption-Decryption.md b/docs/zh/connector-v2/Config-Encryption-Decryption.md index 9293d0e71e6..7664d792fd9 100644 --- a/docs/zh/connector-v2/Config-Encryption-Decryption.md +++ b/docs/zh/connector-v2/Config-Encryption-Decryption.md @@ -183,3 +183,43 @@ Base64编码支持加密以下参数: 5. 将其打成 jar 包, 并添加到 `${SEATUNNEL_HOME}/lib` 目录下。 6. 将选项 `shade.identifier` 的值更改为上面定义在配置文件中的 `ConfigShade#getIdentifier` 的值。 +### 在加密解密方法中使用自定义参数 + +如果您想要使用自定义参数进行加密和解密,可以按照以下步骤操作: +1. 在配置文件的env 中添加`shade.properties`配置,该配置的值是键值对形式(键的类型必须是字符串) ,如下所示: + + ```hocon + env { + shade.properties = { + suffix = "666" + } + } + + ``` +2. 覆写 `ConfigShade` 接口的 `open` 方法,如下所示: + + ```java + public static class ConfigShadeWithProps implements ConfigShade { + + private String suffix; + private String identifier = "withProps"; + + @Override + public void open(Map props) { + this.suffix = String.valueOf(props.get("suffix")); + } + } + ``` + 3. 
在加密和解密方法中使用open 方法中传入的参数,如下所示: + + ```java + @Override + public String encrypt(String content) { + return content + suffix; + } + + @Override + public String decrypt(String content) { + return content.substring(0, content.length() - suffix.length()); + } + ``` \ No newline at end of file diff --git a/docs/zh/connector-v2/sink/Doris.md b/docs/zh/connector-v2/sink/Doris.md index 66fbe728ae5..f0504977aec 100644 --- a/docs/zh/connector-v2/sink/Doris.md +++ b/docs/zh/connector-v2/sink/Doris.md @@ -46,7 +46,7 @@ Doris Sink连接器的内部实现是通过stream load批量缓存和导入的 | table | String | Yes | - | `Doris` 表名, 使用 `${table_name}` 表示上游表名。 | | table.identifier | String | Yes | - | `Doris` 表的名称,2.3.5 版本后将弃用,请使用 `database` 和 `table` 代替。 | | sink.label-prefix | String | Yes | - | stream load导入使用的标签前缀。 在2pc场景下,需要全局唯一性来保证SeaTunnel的EOS语义。 | -| sink.enable-2pc | bool | No | false | 是否启用两阶段提交(2pc),默认为 false。 对于两阶段提交,请参考[此处](https://doris.apache.org/docs/dev/sql-manual/sql-statements/Data-Manipulation-Statements/Load/STREAM-LOAD/)。 | +| sink.enable-2pc | bool | No | false | 是否启用两阶段提交(2pc),默认为 false。 对于两阶段提交,请参考[此处](https://doris.apache.org/docs/data-operate/transaction?_highlight=two&_highlight=phase#stream-load-2pc)。 | | sink.enable-delete | bool | No | - | 是否启用删除。 该选项需要Doris表开启批量删除功能(0.15+版本默认开启),且仅支持Unique模型。 您可以在此[link](https://doris.apache.org/docs/dev/data-operate/delete/batch-delete-manual/)获得更多详细信息 | | sink.check-interval | int | No | 10000 | 加载过程中检查异常时间间隔。 | | sink.max-retries | int | No | 3 | 向数据库写入记录失败时的最大重试次数。 | diff --git a/docs/zh/connector-v2/sink/HdfsFile.md b/docs/zh/connector-v2/sink/HdfsFile.md index c0212ae4017..4561eb15721 100644 --- a/docs/zh/connector-v2/sink/HdfsFile.md +++ b/docs/zh/connector-v2/sink/HdfsFile.md @@ -56,6 +56,7 @@ | is_enable_transaction | boolean | 否 | true | 如果 `is_enable_transaction` 为 true,则在将数据写入目标目录时,我们将确保数据不会丢失或重复。请注意,如果 `is_enable_transaction` 为 `true`,我们将在文件头部自动添加 `${transactionId}_`。目前仅支持 `true`。 | | batch_size | int | 否 | 1000000 | 文件中的最大行数。对于 SeaTunnel Engine,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定。如果 `checkpoint.interval` 的值足够大,则接收器写入器将在文件中写入行,直到文件中的行大于 `batch_size`。如果 `checkpoint.interval` 很小,则接收器写入器将在新检查点触发时创建一个新文件。 | | single_file_mode | boolean | 否 | false | 每个并行度只会输出一个文件,当此参数开启时,batch_size就不会生效。输出的文件名没有文件块后缀。 | +| create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,依然生成对应的数据文件。 | | compress_codec | string | 否 | none | 文件的压缩编解码器及其支持的细节如下所示:[txt: `lzo` `none`,json: `lzo` `none`,csv: `lzo` `none`,orc: `lzo` `snappy` `lz4` `zlib` `none`,parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`]。提示:excel类型不支持任何压缩格式。 | | krb5_path | string | 否 | /etc/krb5.conf | kerberos 的 krb5 路径 | | kerberos_principal | string | 否 | - | kerberos 的主体 | @@ -65,7 +66,7 @@ | enable_header_write | boolean | 否 | false | 仅在 file_format_type 为 text,csv 时使用。
false:不写入表头,true:写入表头。 | | max_rows_in_memory | int | 否 | - | 仅当 file_format 为 excel 时使用。当文件格式为 Excel 时,可以缓存在内存中的最大数据项数。 | | sheet_name | string | 否 | Sheet${Random number} | 仅当 file_format 为 excel 时使用。将工作簿的表写入指定的表名 | -| remote_user | string | 否 | - | Hdfs的远端用户名。 | +| remote_user | string | 否 | - | Hdfs的远端用户名。 | ### 提示 diff --git a/docs/zh/connector-v2/sink/LocalFile.md b/docs/zh/connector-v2/sink/LocalFile.md index dbd77384882..13cfd3cfbf8 100644 --- a/docs/zh/connector-v2/sink/LocalFile.md +++ b/docs/zh/connector-v2/sink/LocalFile.md @@ -50,6 +50,7 @@ | is_enable_transaction | boolean | 否 | true | 是否启用事务 | | batch_size | int | 否 | 1000000 | 批量大小 | | single_file_mode | boolean | 否 | false | 每个并行度只会输出一个文件,当此参数开启时,batch_size就不会生效。输出的文件名没有文件块后缀。 | +| create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,依然生成对应的数据文件。 | | compress_codec | string | 否 | none | 压缩编码 | | common-options | object | 否 | - | 常见选项 | | max_rows_in_memory | int | 否 | - | 仅在 file_format_type 为 excel 时使用 | diff --git a/docs/zh/connector-v2/sink/Neo4j.md b/docs/zh/connector-v2/sink/Neo4j.md new file mode 100644 index 00000000000..8efb97002b2 --- /dev/null +++ b/docs/zh/connector-v2/sink/Neo4j.md @@ -0,0 +1,144 @@ +# Neo4j + +> Neo4j 写连接器 + +## 描述 + +写数据到 `Neo4j`。 + +`neo4j-java-driver` version 4.4.9 + +## 主要功能 + +- [ ] [精确一次](../../concept/connector-v2-features.md) + +## 配置选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|----------------------------|---------|------|----------| +| uri | String | 是 | - | +| username | String | 否 | - | +| password | String | 否 | - | +| max_batch_size | Integer | 否 | - | +| write_mode | String | 否 | OneByOne | +| bearer_token | String | 否 | - | +| kerberos_ticket | String | 否 | - | +| database | String | 是 | - | +| query | String | 是 | - | +| queryParamPosition | Object | 是 | - | +| max_transaction_retry_time | Long | 否 | 30 | +| max_connection_timeout | Long | 否 | 30 | +| common-options | config | 否 | - | + +### uri [string] + +`Neo4j`数据库的URI,参考配置: `neo4j://localhost:7687`。 + +### username [string] + +`Neo4j`用户名。 + +### password [string] + +`Neo4j`密码。如果提供了“用户名”,则需要。 + +### max_batch_size[Integer] + +`max_batch_size` 是指写入数据时,单个事务中可以写入的最大数据条目数。 + +### write_mode + +默认值为 `oneByOne` ,如果您想批量写入,请将其设置为`Batch` + +```cypher +unwind $ttt as row create (n:Label) set n.name = row.name,n.age = rw.age +``` + +`ttt`代表一批数据。,`ttt`可以是任意字符串,只要它与配置的`batch_data_variable` 匹配。 + +### bearer_token [string] + +`Neo4j`的`base64`编码`bearer token`用于鉴权。 + +### kerberos_ticket [string] + +`Neo4j`的`base64`编码`kerberos ticket`用于鉴权。 + +### database [string] + +数据库名称。 + +### query [string] + +查询语句。包含在运行时用相应值替换的参数占位符。 + +### queryParamPosition [object] + +查询参数的位置映射信息。 + +键名是参数占位符名称。 + +关联值是字段在输入数据行中的位置。 + +### max_transaction_retry_time [long] + +最大事务重试时间(秒)。如果超过,则交易失败。 + +### max_connection_timeout [long] + +等待TCP连接建立的最长时间(秒)。 + +### common options + +Sink插件常用参数, 详细信息请参考 [Sink公共配置](../sink-common-options.md) + +## OneByOne模式写示例 + +``` +sink { + Neo4j { + uri = "neo4j://localhost:7687" + username = "neo4j" + password = "1234" + database = "neo4j" + max_transaction_retry_time = 10 + max_connection_timeout = 10 + query = "CREATE (a:Person {name: $name, age: $age})" + queryParamPosition = { + name = 0 + age = 1 + } + } +} +``` + +## Batch模式写示例 +> cypher提供的`unwind`关键字支持批量写入, +> 批量数据的默认变量是batch。如果你写一个批处理写语句, +> 那么你应该声明 cypher `unwind $batch` 作为行 +``` +sink { + Neo4j { + uri = "bolt://localhost:7687" + username = "neo4j" + password = "neo4j" + database = "neo4j" + max_batch_size = 1000 + write_mode = "BATCH" + max_transaction_retry_time = 
3 + max_connection_timeout = 10 + query = "unwind $batch as row create(n:MyLabel) set n.name = row.name,n.age = row.age" + } +} +``` + +## Changelog + +### 2.2.0-beta 2022-09-26 + +- 添加 Neo4j 写连接器 + +### issue ##4835 + +- 写连接器支持批量写入 + diff --git a/docs/zh/connector-v2/sink/PostgreSql.md b/docs/zh/connector-v2/sink/PostgreSql.md new file mode 100644 index 00000000000..7a053421701 --- /dev/null +++ b/docs/zh/connector-v2/sink/PostgreSql.md @@ -0,0 +1,270 @@ +# PostgreSql + +> JDBC PostgreSql 数据接收器 + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 描述 + +通过 JDBC 写入数据。支持批处理模式和流式模式,支持并发写入,支持精确一次语义(使用 XA 事务保证)。 + +## 使用依赖 + +### 对于 Spark/Flink 引擎 + +> 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 + +### 对于 SeaTunnel Zeta 引擎 + +> 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 + +## 主要特性 + +- [x] [精确一次](../../concept/connector-v2-features.md) +- [x] [变更数据捕获(CDC)](../../concept/connector-v2-features.md) + +> 使用 `XA 事务` 来确保 `精确一次`。因此,仅对支持 `XA 事务` 的数据库支持 `精确一次`。您可以设置 `is_exactly_once=true` 来启用此功能。 + +## 支持的数据源信息 +| 数据源 | 支持的版本 | 驱动 | URL | Maven | +|--------------|-----------------------------------------------------|----------------------|---------------------------------------|--------------------------------------------------------------------------| +| PostgreSQL | 不同的依赖版本有不同的驱动类。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | +| PostgreSQL | 如果您想在 PostgreSQL 中处理 GEOMETRY 类型。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | + +## 数据库依赖 + +> 请下载与 'Maven' 对应的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录中。
+> 例如 PostgreSQL 数据源:`cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/`
+> 如果您想在 PostgreSQL 中处理 GEOMETRY 类型,请将 `postgresql-xxx.jar` 和 `postgis-jdbc-xxx.jar` 添加到 `$SEATUNNEL_HOME/plugins/jdbc/lib/` 中。 + +## 数据类型映射 +| PostgreSQL 数据类型 | SeaTunnel 数据类型 | +|--------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| BOOL
| BOOLEAN | +| _BOOL
| ARRAY<BOOLEAN> | +| BYTEA
| BYTES | +| _BYTEA
| ARRAY<TINYINT> | +| INT2
SMALLSERIAL
INT4
SERIAL
| INT | +| _INT2
_INT4
| ARRAY<INT> | +| INT8
BIGSERIAL
| BIGINT | +| _INT8
| ARRAY<BIGINT> | +| FLOAT4
| FLOAT | +| _FLOAT4
| ARRAY<FLOAT> | +| FLOAT8
| DOUBLE | +| _FLOAT8
| ARRAY<DOUBLE> | +| NUMERIC(指定列的列大小>0) | DECIMAL(指定列的列大小,获取指定列小数点右侧的数字位数) | +| NUMERIC(指定列的列大小<0) | DECIMAL(38, 18) | +| BPCHAR
CHARACTER
VARCHAR
TEXT
GEOMETRY
GEOGRAPHY
JSON
JSONB
UUID | STRING | +| _BPCHAR
_CHARACTER
_VARCHAR
_TEXT | ARRAY<STRING> | +| TIMESTAMP
| TIMESTAMP | +| TIME
| TIME | +| DATE
| DATE | +| 其他数据类型 | 目前不支持 | + +## 选项 + +| 名称 | 类型 | 必填 | 默认 | 描述 | +|-------------------------------------------|---------|------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | 是 | - | JDBC 连接的 URL。参见示例:jdbc:postgresql://localhost:5432/test
如果您使用 json 或 jsonb 类型插入,请添加 jdbc url 字符串 `stringtype=unspecified` 选项。 | +| driver | String | 是 | - | 用于连接远程数据源的 JDBC 类名,
如果使用 PostgreSQL,则该值为 `org.postgresql.Driver`。 | +| user | String | 否 | - | 连接实例的用户名。 | +| password | String | 否 | - | 连接实例的密码。 | +| query | String | 否 | - | 使用此 SQL 将上游输入数据写入数据库。例如 `INSERT ...`,`query` 的优先级更高。 | +| database | String | 否 | - | 使用此 `database` 和 `table-name` 自动生成 SQL,并接收上游输入数据写入数据库。
此选项与 `query` 互斥,并具有更高的优先级。 | +| table | String | 否 | - | 使用数据库和此表名自动生成 SQL,并接收上游输入数据写入数据库。
此选项与 `query` 互斥,并具有更高的优先级。表参数可以填写一个不想的表的名称,最终将作为创建表的表名,并支持变量(`${table_name}`,`${schema_name}`)。替换规则: `${schema_name}` 将替换为传递给目标端的 SCHEMA 名称,`${table_name}` 将替换为传递给目标端的表名称。 | +| primary_keys | Array | 否 | - | 此选项用于支持在自动生成 SQL 时进行 `insert`,`delete` 和 `update` 操作。 | +| support_upsert_by_query_primary_key_exist | Boolean | 否 | false | 选择使用 INSERT SQL,UPDATE SQL 根据查询主键存在来处理更新事件(INSERT,UPDATE_AFTER)。此配置仅在数据库不支持 upsert 语法时使用。**注意**:此方法性能较低。 | +| connection_check_timeout_sec | Int | 否 | 30 | 用于验证连接的数据库操作完成的等待时间(秒)。 | +| max_retries | Int | 否 | 0 | 提交失败的重试次数(executeBatch)。 | +| batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲记录的数量达到 `batch_size` 或时间达到 `checkpoint.interval`
时,数据将刷新到数据库。 | +| is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,将使用 XA 事务。如果启用,您需要
设置 `xa_data_source_class_name`。 | +| generate_sink_sql | Boolean | 否 | false | 根据要写入的数据库表生成 SQL 语句。 | +| xa_data_source_class_name | String | 否 | - | 数据库驱动的 XA 数据源类名,例如,PostgreSQL 是 `org.postgresql.xa.PGXADataSource`,并
请参阅附录以获取其他数据源。 | +| max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数。 | +| transaction_timeout_sec | Int | 否 | -1 | 事务开启后的超时时间,默认值为 -1(永不超时)。注意设置超时可能会影响
精确一次语义。 | +| auto_commit | Boolean | 否 | true | 默认启用自动事务提交。 | +| field_ide | String | 否 | - | 识别字段在从源到汇的同步时是否需要转换。`ORIGINAL` 表示无需转换;`UPPERCASE` 表示转换为大写;`LOWERCASE` 表示转换为小写。 | +| properties | Map | 否 | - | 附加连接配置参数,当 properties 和 URL 具有相同参数时,优先级由
驱动的具体实现决定。例如,在 MySQL 中,properties 优先于 URL。 | +| common-options | | 否 | - | Sink 插件的公共参数,请参阅 [Sink 公共选项](../sink-common-options.md) 以获取详细信息。 | +| schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 在同步任务开启之前,根据目标端现有表结构选择不同处理方案。 | +| data_save_mode | Enum | 否 | APPEND_DATA | 在同步任务开启之前,根据目标端现有数据选择不同处理方案。 | +| custom_sql | String | 否 | - | 当 `data_save_mode` 选择 `CUSTOM_PROCESSING` 时,您应该填写 `CUSTOM_SQL` 参数。此参数通常填入可执行的 SQL。SQL 将在同步任务之前执行。 | +| enable_upsert | Boolean | 否 | true | 通过主键存在启用 upsert,如果任务没有重复数据,设置此参数为 `false` 可以加快数据导入。 | + +### table [字符串] + +使用 `database` 和此 `table-name` 自动生成 SQL,并接收上游输入数据写入数据库。 + +此选项与 `query` 互斥,并具有更高的优先级。 + +表参数可以填写一个不想的表的名称,最终将作为创建表的表名,并支持变量(`${table_name}`,`${schema_name}`)。替换规则:`${schema_name}` 将替换为传递给目标端的 SCHEMA 名称,`${table_name}` 将替换为传递给目标端的表名称。 + +例如: +1. `${schema_name}.${table_name} _test` +2. `dbo.tt_${table_name} _sink` +3. `public.sink_table` + +### schema_save_mode [枚举] + +在同步任务开启之前,根据目标端现有表结构选择不同处理方案。 +选项介绍: +`RECREATE_SCHEMA` :当表不存在时将创建,保存时删除并重建。 +`CREATE_SCHEMA_WHEN_NOT_EXIST` :当表不存在时创建,保存时跳过。 +`ERROR_WHEN_SCHEMA_NOT_EXIST` :当表不存在时报告错误。 +`IGNORE` :忽略对表的处理。 + +### data_save_mode [枚举] + +在同步任务开启之前,根据目标端现有数据选择不同处理方案。 +选项介绍: +`DROP_DATA`:保留数据库结构并删除数据。 +`APPEND_DATA`:保留数据库结构,保留数据。 +`CUSTOM_PROCESSING`:用户定义处理。 +`ERROR_WHEN_DATA_EXISTS`:当存在数据时报告错误。 +### custom_sql [字符串] + +当 `data_save_mode` 选择 `CUSTOM_PROCESSING` 时,您应该填写 `CUSTOM_SQL` 参数。此参数通常填入可以执行的 SQL。SQL 将在同步任务之前执行。 + +### 提示 + +> 如果未设置 `partition_column`,它将以单线程并发运行;如果设置了 `partition_column`,它将根据任务的并发性并行执行。 + +## 任务示例 + +### 简单示例: + +> 此示例定义了一个 SeaTunnel 同步任务,通过 FakeSource 自动生成数据并将其发送到 JDBC Sink。FakeSource 生成总共 16 行数据(`row.num=16`),每行有两个字段,`name`(字符串类型)和 `age`(整数类型)。最终目标表 `test_table` 也将包含 16 行数据。在运行此作业之前,您需要在 PostgreSQL 中创建数据库 `test` 和表 `test_table`。如果您还未安装和部署 SeaTunnel,请按照 [安装 SeaTunnel](../../start-v2/locally/deployment.md) 中的说明进行安装和部署。然后按照 [快速开始 SeaTunnel 引擎](../../start-v2/locally/quick-start-seatunnel-engine.md) 中的说明运行此作业。 + +``` +# Defining the runtime environment +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + plugin_output = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/source +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = root + password = 123456 + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/sink +} +``` + +### 生成 Sink SQL + + +> 此示例不需要编写复杂的 SQL 语句,您可以配置数据库名称和表名称,系统将自动为您生成添加语句。 + +``` +sink { + Jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = org.postgresql.Driver + user = root + password = 123456 + + generate_sink_sql = true + database = test + table = "public.test_table" + } +} +``` + +### 精确一次: + +> 对于精确写入场景,我们保证精确一次。 + +``` +sink { + jdbc { + # 
if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + + max_retries = 0 + user = root + password = 123456 + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "org.postgresql.xa.PGXADataSource" + } +} +``` + +### CDC(变更数据捕获)事件 + +> 我们也支持 CDC 变更数据。在这种情况下,您需要配置数据库、表和主键。 + +``` +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = root + password = 123456 + + generate_sink_sql = true + # You need to configure both database and table + database = test + table = sink_table + primary_keys = ["id","name"] + field_ide = UPPERCASE + } +} +``` + +### 保存模式功能 + +``` +sink { + Jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = org.postgresql.Driver + user = root + password = 123456 + + generate_sink_sql = true + database = test + table = "public.test_table" + schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" + data_save_mode="APPEND_DATA" + } +} +``` + diff --git a/docs/zh/connector-v2/source/Neo4j.md b/docs/zh/connector-v2/source/Neo4j.md new file mode 100644 index 00000000000..20471b7989d --- /dev/null +++ b/docs/zh/connector-v2/source/Neo4j.md @@ -0,0 +1,100 @@ +# Neo4j + +> Neo4j 源连接器器 + +## 描述 + +从 `Neo4j` 读取数据 + +`neo4j-java-driver` 版本 4.4.9 + +## 主要功能 + +- [x] [批处理](../../concept/connector-v2-features.md) +- [ ] [流处理](../../concept/connector-v2-features.md) +- [ ] [精确一次](../../concept/connector-v2-features.md) +- [x] [列投影](../../concept/connector-v2-features.md) +- [ ] [并行度](../../concept/connector-v2-features.md) +- [ ] [支持用户定义拆分](../../concept/connector-v2-features.md) + +## 配置选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|----------------------------|--------|------|-----| +| uri | String | 是 | - | +| username | String | 否 | - | +| password | String | 否 | - | +| bearer_token | String | 否 | - | +| kerberos_ticket | String | 否 | - | +| database | String | 是 | - | +| query | String | 是 | - | +| schema | Object | 是 | - | +| max_transaction_retry_time | Long | 否 | 30 | +| max_connection_timeout | Long | 否 | 30 | + +### uri [string] + +`Neo4j`数据库的URI,参考配置: `neo4j://localhost:7687`。 + +### username [string] + +`Neo4j`用户名。 + +### password [string] + +`Neo4j`密码。如果提供了“用户名”,则需要。 + +### bearer_token [string] + +`Neo4j`的`base64`编码`bearer token`用于鉴权。 + +### kerberos_ticket [string] + +`Neo4j`的`base64`编码`kerberos ticket`用于鉴权。 + +### database [string] + +数据库名。 + +### query [string] + +查询语句。 + +### schema.fields [string] + +返回`query` 的字段。 + +查看 [列投影](../../concept/connector-v2-features.md) + +### max_transaction_retry_time [long] + +最大事务重试时间(秒)。如果超过,则事务失败。 + +### max_connection_timeout [long] + +等待TCP连接建立的最长时间(秒)。 + +## 示例 + +``` +source { + Neo4j { + uri = "neo4j://localhost:7687" + username = "neo4j" + password = "1234" + database = "neo4j" + max_transaction_retry_time = 1 + max_connection_timeout = 1 + query = "MATCH (a:Person) RETURN a.name, a.age" + schema { + fields { + a.age=INT + a.name=STRING + } + } + } +} +``` + + + diff --git a/docs/zh/connector-v2/source/PostgreSQL-CDC.md b/docs/zh/connector-v2/source/PostgreSQL-CDC.md new file mode 100644 index 00000000000..bf6c6117333 --- /dev/null +++ b/docs/zh/connector-v2/source/PostgreSQL-CDC.md @@ -0,0 +1,193 @@ +# PostgreSQL 
CDC + +> PostgreSQL CDC 源连接器 + +## 支持的引擎 + +> SeaTunnel Zeta
+> Flink
+ +## 主要特性 + +- [ ] [批处理](../../concept/connector-v2-features.md) +- [x] [流处理](../../concept/connector-v2-features.md) +- [x] [精确一次](../../concept/connector-v2-features.md) +- [ ] [列投影](../../concept/connector-v2-features.md) +- [x] [并行性](../../concept/connector-v2-features.md) +- [x] [支持用户定义的拆分](../../concept/connector-v2-features.md) + +## 描述 + +Postgre CDC 连接器允许从 Postgre 数据库读取快照数据和增量数据。本文件描述了如何设置 Postgre CDC 连接器,以便对 Postgre 数据库执行 SQL 查询。 + +## 支持的数据源信息 + +| 数据源 | 支持的版本 | 驱动 | Url | Maven | +|------------|-----------------------------------------------------|---------------------|---------------------------------------|--------------------------------------------------------------------------| +| PostgreSQL | 不同的依赖版本有不同的驱动类。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | +| PostgreSQL | 如果您想在 PostgreSQL 中操作 GEOMETRY 类型。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | + +## 使用依赖 + +### 安装 Jdbc 驱动 + +#### 对于 Spark/Flink 引擎 + +> 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 + +#### 对于 SeaTunnel Zeta 引擎 + +> 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 + +请下载并将 PostgreSQL 驱动放入 `${SEATUNNEL_HOME}/lib/` 目录。例如:cp postgresql-xxx.jar `$SEATUNNEL_HOME/lib/` + +> 以下是启用 PostgreSQL 中的 CDC(变化数据捕获)的步骤: + +1. 确保 wal_level 设置为 logical:通过在 postgresql.conf 配置文件中添加 "wal_level = logical" 来修改,重启 PostgreSQL 服务器以使更改生效。 + 或者,您可以使用 SQL 命令直接修改配置: + +```sql +ALTER SYSTEM SET wal_level TO 'logical'; +SELECT pg_reload_conf(); +``` + +2. 将指定表的 REPLICA 策略更改为 FULL + +```sql +ALTER TABLE your_table_name REPLICA IDENTITY FULL; +``` + +## 数据类型映射 + +| PostgreSQL 数据类型 | SeaTunnel 数据类型 | +|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| BOOL
| BOOLEAN | +| _BOOL
| ARRAY<BOOLEAN> | +| BYTEA
| BYTES | +| _BYTEA
| ARRAY<TINYINT> | +| INT2
SMALLSERIAL
INT4
SERIAL
| INT | +| _INT2
_INT4
| ARRAY<INT> | +| INT8
BIGSERIAL
| BIGINT | +| _INT8
| ARRAY<BIGINT> | +| FLOAT4
| FLOAT | +| _FLOAT4
| ARRAY<FLOAT> | +| FLOAT8
| DOUBLE | +| _FLOAT8
| ARRAY<DOUBLE> | +| NUMERIC(指定列的列大小>0) | DECIMAL(指定列的列大小, 获取指定列小数点右侧的位数) | +| NUMERIC(指定列的列大小<0) | DECIMAL(38, 18) | +| BPCHAR
CHARACTER
VARCHAR
TEXT
GEOMETRY
GEOGRAPHY
JSON
JSONB | STRING | +| _BPCHAR
_CHARACTER
_VARCHAR
_TEXT | ARRAY<STRING> | +| TIMESTAMP
| TIMESTAMP | +| TIME
| TIME | +| DATE
| DATE | +| 其他数据类型 | 尚不支持 | + +## 源选项 + +| 名称 | 类型 | 必需 | 默认 | 描述 | +|------------------------------------------------|----------|------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| base-url | String | 是 | - | JDBC 连接的 URL。参考案例:`jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`。 | +| username | String | 是 | - | 连接到数据库服务器时使用的数据库名称。 | +| password | String | 是 | - | 连接到数据库服务器时使用的密码。 | +| database-names | List | 否 | - | 需要监控的数据库名称。 | +| table-names | List | 是 | - | 需要监控的数据库表名称。表名称需要包含数据库名称,例如:`database_name.table_name`。 | +| table-names-config | List | 否 | - | 表配置列表。例如: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | +| startup.mode | List | 否 | INITIAL | PostgreSQL CDC 消费者的可选启动模式,有效枚举为 `initial`、`earliest` 和 `latest`。
`initial`: 启动时同步历史数据,然后同步增量数据。
`earliest`: 从可能的最早偏移量启动。
`latest`: 从最新偏移量启动。 | +| snapshot.split.size | Integer | 否 | 8096 | 表快照的拆分大小(行数),捕获的表在读取表快照时被拆分成多个拆分。 | +| snapshot.fetch.size | Integer | 否 | 1024 | 读取表快照时每次轮询的最大获取大小。 | +| slot.name | String | 否 | - | 为特定数据库/模式创建的用于流式传输更改的 PostgreSQL 逻辑解码槽的名称。服务器使用此槽将事件流式传输到您正在配置的连接器。默认值为 seatunnel。 | +| decoding.plugin.name | String | 否 | pgoutput | 安装在服务器上的 Postgres 逻辑解码插件的名称,支持的值有 decoderbufs、wal2json、wal2json_rds、wal2json_streaming、wal2json_rds_streaming 和 pgoutput。 | +| server-time-zone | String | 否 | UTC | 数据库服务器中的会话时区。如果未设置,则使用 ZoneId.systemDefault() 来确定服务器时区。 | +| connect.timeout.ms | Duration | 否 | 30000 | 连接器在尝试连接到数据库服务器后应等待的最大时间,以防超时。 | +| connect.max-retries | Integer | 否 | 3 | 连接器应重试建立数据库服务器连接的最大重试次数。 | +| connection.pool.size | Integer | 否 | 20 | JDBC 连接池大小。 | +| chunk-key.even-distribution.factor.upper-bound | Double | 否 | 100 | 块键分布因子的上限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子小于或等于此上限(即 (MAX(id) - MIN(id) + 1) / 行数),则将优化表块以实现均匀分布。否则,如果分布因子更大,则将认为该表分布不均匀,并且如果估计的分片数量超过 `sample-sharding.threshold` 指定的值,则将使用基于采样的分片策略。默认值为 100.0。 | +| chunk-key.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 块键分布因子的下限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子大于或等于此下限(即 (MAX(id) - MIN(id) + 1) / 行数),则将优化表块以实现均匀分布。否则,如果分布因子更小,则将认为该表分布不均匀,并且如果估计的分片数量超过 `sample-sharding.threshold` 指定的值,则将使用基于采样的分片策略。默认值为 0.05。 | +| sample-sharding.threshold | Integer | 否 | 1000 | 此配置指定触发采样分片策略的估计分片数量阈值。当分布因子超出由 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,且估计的分片数量(计算为近似行数 / 块大小)超过此阈值时,将使用采样分片策略。这可以帮助更有效地处理大数据集。默认值为 1000 个分片。 | +| inverse-sampling.rate | Integer | 否 | 1000 | 在采样分片策略中使用的采样率的倒数。例如,如果此值设置为 1000,则意味着在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。在处理非常大数据集时,较低的采样率尤为有用。默认值为 1000。 | +| exactly_once | Boolean | 否 | false | 启用精确一次语义。 | +| format | Enum | 否 | DEFAULT | PostgreSQL CDC 的可选输出格式,有效枚举为 `DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`。 | +| debezium | Config | 否 | - | 将 [Debezium 的属性](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) 传递给用于捕获 PostgreSQL 服务器数据更改的 Debezium 嵌入式引擎。 | +| common-options | | 否 | - | 源插件的公共参数,请参阅 [源公共选项](../source-common-options.md) 获取详细信息。 | + +## 任务示例 + +### 简单 + +> 支持多表读取 + +``` + + +env { + # You can set engine configuration here + execution.parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 + read_limit.bytes_per_second=7000000 + read_limit.rows_per_second=400 +} + +source { + Postgres-CDC { + plugin_output = "customers_Postgre_cdc" + username = "postgres" + password = "postgres" + database-names = ["postgres_cdc"] + schema-names = ["inventory"] + table-names = ["postgres_cdc.inventory.postgres_cdc_table_1,postgres_cdc.inventory.postgres_cdc_table_2"] + base-url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" + } +} + +transform { + +} + +sink { + jdbc { + plugin_input = "customers_Postgre_cdc" + url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" + driver = "org.postgresql.Driver" + user = "postgres" + password = "postgres" + + generate_sink_sql = true + # You need to configure both database and table + database = postgres_cdc + schema = "inventory" + tablePrefix = "sink_" + primary_keys = ["id"] + } +} +``` + +### 支持自定义表的主键 + +``` +source { + Postgres-CDC { + plugin_output = "customers_mysql_cdc" + username = "postgres" + password = "postgres" + database-names = ["postgres_cdc"] + schema-names = ["inventory"] + table-names = ["postgres_cdc.inventory.full_types_no_primary_key"] + 
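    # 该表没有主键,因此在下方的 table-names-config 中为其显式指定 primaryKeys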
base-url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" + decoding.plugin.name = "decoderbufs" + exactly_once = false + table-names-config = [ + { + table = "postgres_cdc.inventory.full_types_no_primary_key" + primaryKeys = ["id"] + } + ] + } +} +``` + +## 更新日志 + +- 添加 PostgreSQL CDC 源连接器 + +### 下一个版本 diff --git a/docs/zh/connector-v2/source/PostgreSQL.md b/docs/zh/connector-v2/source/PostgreSQL.md new file mode 100644 index 00000000000..1cd2f3de6a1 --- /dev/null +++ b/docs/zh/connector-v2/source/PostgreSQL.md @@ -0,0 +1,307 @@ +# PostgreSQL + +> JDBC PostgreSQL 源连接器 + +## 支持的引擎 + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## 使用依赖 + +### 对于 Spark/Flink 引擎 + +> 1. 您需要确保 [jdbc 驱动的jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 + +### 对于 SeaTunnel Zeta 引擎 + +> 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 + +## 主要特性 + +- [x] [批处理](../../concept/connector-v2-features.md) +- [ ] [流处理](../../concept/connector-v2-features.md) +- [x] [严格一次性](../../concept/connector-v2-features.md) +- [x] [列投影](../../concept/connector-v2-features.md) +- [x] [并行性](../../concept/connector-v2-features.md) +- [x] [支持用户定义的拆分](../../concept/connector-v2-features.md) + +> 支持查询 SQL,并可以实现投影效果。 + +## 描述 + +通过 JDBC 读取外部数据源数据。 + +## 支持的数据源信息 + +| 数据源 | 支持的版本 | 驱动 | URL | Maven | +|----------------|----------------------------------------------------|---------------------|---------------------------------------|--------------------------------------------------------------------------| +| PostgreSQL | 不同的依赖版本有不同的驱动类。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | +| PostgreSQL | 如果您想在 PostgreSQL 中操作 GEOMETRY 类型。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | + +## 数据库依赖 + +> 请下载与 'Maven' 对应的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录中
+> 例如,对于 PostgreSQL 数据源:`cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/`
+> 如果您想在 PostgreSQL 中操作 GEOMETRY 类型,请将 postgresql-xxx.jar 和 postgis-jdbc-xxx.jar 添加到 $SEATUNNEL_HOME/plugins/jdbc/lib/ + +## 数据类型映射 + +| PostgreSQL 数据类型 | SeaTunnel 数据类型 | +|--------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| BOOL
| BOOLEAN | +| _BOOL
| ARRAY<BOOLEAN> | +| BYTEA
| BYTES | +| _BYTEA
| ARRAY<TINYINT> | +| INT2
SMALLSERIAL | SMALLINT | +| _INT2 | ARRAY<SMALLINT> | +| INT4
SERIAL
| INT | +| _INT4
| ARRAY<INT> | +| INT8
BIGSERIAL
| BIGINT | +| _INT8
| ARRAY<BIGINT> | +| FLOAT4
| FLOAT | +| _FLOAT4
| ARRAY<FLOAT> | +| FLOAT8
| DOUBLE | +| _FLOAT8
| ARRAY<DOUBLE> | +| NUMERIC(指定列的列大小>0) | DECIMAL(指定列的列大小,获取指定列小数点右侧的数字位数) | +| NUMERIC(指定列的列大小<0) | DECIMAL(38, 18) | +| BPCHAR
CHARACTER
VARCHAR
TEXT
GEOMETRY
GEOGRAPHY
JSON
JSONB
UUID | STRING | +| _BPCHAR
_CHARACTER
_VARCHAR
_TEXT | ARRAY<STRING> | +| TIMESTAMP(s)
TIMESTAMPTZ(s) | TIMESTAMP(s) | +| TIME(s)
TIMETZ(s) | TIME(s) | +| DATE
| DATE | + +## 选项 + +| 名称 | 类型 | 必需 | 默认 | 描述 | +|----------------------------------------------|------------|------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:postgresql://localhost:5432/test | +| driver | String | 是 | - | 用于连接到远程数据源的 JDBC 类名,
如果您使用 PostgreSQL,则值为 `org.postgresql.Driver`。 | +| user | String | 否 | - | 连接实例的用户名 | +| password | String | 否 | - | 连接实例的密码 | +| query | String | 是 | - | 查询语句 | +| connection_check_timeout_sec | Int | 否 | 30 | 用于验证连接的数据库操作完成的等待时间(秒) | +| partition_column | String | 否 | - | 用于并行化的分区列名,仅支持数字类型,
仅支持数字类型主键,并且只能配置一列。 | +| partition_lower_bound | BigDecimal | 否 | - | 扫描的 partition_column 的最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 | +| partition_upper_bound | BigDecimal | 否 | - | 扫描的 partition_column 的最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 | +| partition_num | Int | 否 | 作业并行性 | 分区数量,仅支持正整数。默认值为作业并行性 | +| fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置
用于查询的行抓取大小,以通过减少所需的数据库访问次数来提高性能。
0 表示使用 JDBC 默认值。 | +| properties | Map | 否 | - | 其他连接配置参数,当属性和 URL 具有相同参数时,
优先级由驱动程序的具体实现决定。在 MySQL 中,属性优先于 URL。 | +| table_path | String | 否 | - | 表的完整路径,您可以使用此配置替代 `query`。
示例:
mysql: "testdb.table1"
oracle: "test_schema.table1"
sqlserver: "testdb.test_schema.table1"
postgresql: "testdb.test_schema.table1" | +| table_list | Array | 否 | - | 要读取的表列表,您可以使用此配置替代 `table_path` 示例:```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | +| where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。 例如 `where id > 100` | +| split.size | Int | 否 | 8096 | 表的拆分大小(行数),被捕获的表在读取时被拆分为多个拆分。 | +| split.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 块键分布因子的下限。此因子用于确定表数据是否均匀分布。
如果计算出的分布因子大于或等于此下限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较小,则将视为不均匀分布,当估计的分片数超过 `sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 0.05。 | +| split.even-distribution.factor.upper-bound | Double | 否 | 100 | 块键分布因子的上限。此因子用于确定表数据是否均匀分布。
如果计算出的分布因子小于或等于此上限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较大,则将视为不均匀分布,当估计的分片数超过 `sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 100.0。 | +| split.sample-sharding.threshold | Int | 否 | 10000 | 此配置指定触发样本分片策略的估计分片数阈值。
当分布因子超出 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围时,且估计的分片数(计算为近似行数 / 块大小)超过此阈值,将使用样本分片策略。这可以帮助更高效地处理大数据集。默认值为 1000 个分片。 | +| split.inverse-sampling.rate | Int | 否 | 1000 | 在样本分片策略中使用的采样率的逆数。例如,如果此值设置为 1000,表示在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。在处理非常大的数据集时,较低的采样率尤其有用。默认值为 1000。 | +| +## 并行读取器 + +JDBC 源连接器支持从表中并行读取数据。SeaTunnel 将使用某些规则来拆分表中的数据,这些数据将交给读取器进行读取。读取器的数量由 `parallelism` 选项确定。 + +**拆分键规则:** + +1. 如果 `partition_column` 不为 null,将用于计算拆分。该列必须属于 **支持的拆分数据类型**。 +2. 如果 `partition_column` 为 null,SeaTunnel 将从表中读取模式并获取主键和唯一索引。如果主键和唯一索引中有多列,则使用第一个属于 **支持的拆分数据类型** 的列来拆分数据。例如,表有主键(nn guid, name varchar),因为 `guid` 不在 **支持的拆分数据类型** 中,因此将使用列 `name` 来拆分数据。 + +**支持的拆分数据类型:** +* 字符串 +* 数字(int, bigint, decimal, ...) +* 日期 + +### 与拆分相关的选项 + +#### split.size + +每个拆分中有多少行,当读取表时,被捕获的表将拆分为多个拆分。 + +#### split.even-distribution.factor.lower-bound + +> 不推荐使用 + +块键分布因子的下限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子大于或等于此下限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较小,则将视为不均匀分布,当估计的分片数超过 `sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 0.05。 + +#### split.even-distribution.factor.upper-bound + +> 不推荐使用 + +块键分布因子的上限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子小于或等于此上限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较大,则将视为不均匀分布,当估计的分片数超过 `sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 100.0。 + +#### split.sample-sharding.threshold + +此配置指定触发样本分片策略的估计分片数阈值。当分布因子超出 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围时,且估计的分片数(计算为近似行数 / 块大小)超过此阈值,将使用样本分片策略。这可以帮助更高效地处理大数据集。默认值为 1000 个分片。 + +#### split.inverse-sampling.rate + +在样本分片策略中使用的采样率的逆数。例如,如果此值设置为 1000,表示在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。在处理非常大的数据集时,较低的采样率尤其有用。默认值为 1000。 + +#### partition_column [字符串] + +用于拆分数据的列名。 + +#### partition_upper_bound [BigDecimal] + +扫描的 partition_column 最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 + +#### partition_lower_bound [BigDecimal] + +扫描的 partition_column 最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 + +#### partition_num [整数] + +> 不推荐使用,正确的方法是通过 `split.size` 控制拆分数量 + +我们需要拆分成多少个拆分,仅支持正整数。默认值为作业并行性。 + +## 提示 + +> 如果表无法拆分(例如,表没有主键或唯一索引,并且未设置 `partition_column`),将以单一并发运行。 +> +> 使用 `table_path` 替代 `query` 进行单表读取。如果需要读取多个表,请使用 `table_list`。 + +## 任务示例 + +### 简单示例: + +> 此示例查询您测试 "database" 中 type_bin 为 'table' 的 16 条数据,并以单并行方式查询其所有字段。您还可以指定要查询的字段,以便最终输出到控制台。 + +``` +# Defining the runtime environment +env { + parallelism = 4 + job.mode = "BATCH" +} + +source{ + Jdbc { + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source limit 16" + } +} + +transform { + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### 按 partition_column 并行读取 + +> 使用您配置的分片字段和分片数据并行读取查询表。如果您想要读取整个表,可以这样做。 + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source{ + jdbc{ + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source" + partition_column= "id" + partition_num = 5 + } +} +sink { + Console {} +} +``` + +### 按主键或唯一索引并行读取 + +> 配置 `table_path` 将启用自动拆分,您可以配置 `split.*` 来调整拆分策略。 + +``` +env { + parallelism = 4 + job.mode = "BATCH" +} +source { + Jdbc { + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + table_path = "test.public.AllDataType_1" + query = "select * from public.AllDataType_1" + 
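    # 可选:split.size 用于调整自动拆分策略,表示每个拆分包含的行数(此处为示例值)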
split.size = 10000 + } +} + +sink { + Console {} +} +``` + +### 并行边界: + +> 在查询中指定上下边界内的数据更为高效。根据您配置的上下边界读取数据源将更为高效。 + +``` +source{ + jdbc{ + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source" + partition_column= "id" + + # The name of the table returned + plugin_output = "jdbc" + partition_lower_bound = 1 + partition_upper_bound = 50 + partition_num = 5 + } +} +``` + +### 多表读取: + +***配置 `table_list` 将启用自动拆分,您可以配置 `split.*` 来调整拆分策略*** + +```hocon +env { + job.mode = "BATCH" + parallelism = 4 +} +source { + Jdbc { + url="jdbc:postgresql://datasource01:5432/demo" + user="iDm82k6Q0Tq+wUprWnPsLQ==" + driver="org.postgresql.Driver" + password="iDm82k6Q0Tq+wUprWnPsLQ==" + "table_list"=[ + { + "table_path"="demo.public.AllDataType_1" + }, + { + "table_path"="demo.public.alldatatype" + } + ] + #where_condition= "where id > 100" + split.size = 10000 + #split.even-distribution.factor.upper-bound = 100 + #split.even-distribution.factor.lower-bound = 0.05 + #split.sample-sharding.threshold = 1000 + #split.inverse-sampling.rate = 1000 + } +} + +sink { + Console {} +} +``` + diff --git a/docs/zh/connector-v2/source/StarRocks.md b/docs/zh/connector-v2/source/StarRocks.md new file mode 100644 index 00000000000..773718648ab --- /dev/null +++ b/docs/zh/connector-v2/source/StarRocks.md @@ -0,0 +1,183 @@ +# StarRocks + +> StarRocks 源连接器 + +## 描述 + +通过`StarRocks`读取外部数据源数据。 +`StarRocks`源连接器的内部实现是从`FE`获取查询计划, +将查询计划作为参数传递给`BE`节点,然后从`BE`节点获取数据结果。 + +## 主要功能 + +- [x] [批处理](../../concept/connector-v2-features.md) +- [ ] [流处理](../../concept/connector-v2-features.md) +- [ ] [精确一次](../../concept/connector-v2-features.md) +- [x] [列投影](../../concept/connector-v2-features.md) +- [x] [并行度](../../concept/connector-v2-features.md) +- [x] [支持用户定义拆分](../../concept/connector-v2-features.md) + +## 配置选项 + +| 名称 | 类型 | 是否必须 | 默认值 | +|-------------------------|--------|------|-------------------| +| nodeUrls | list | 是 | - | +| username | string | 是 | - | +| password | string | 是 | - | +| database | string | 是 | - | +| table | string | 是 | - | +| scan_filter | string | 否 | - | +| schema | config | 是 | - | +| request_tablet_size | int | 否 | Integer.MAX_VALUE | +| scan_connect_timeout_ms | int | 否 | 30000 | +| scan_query_timeout_sec | int | 否 | 3600 | +| scan_keep_alive_min | int | 否 | 10 | +| scan_batch_rows | int | 否 | 1024 | +| scan_mem_limit | long | 否 | 2147483648 | +| max_retries | int | 否 | 3 | +| scan.params.* | string | 否 | - | + +### nodeUrls [list] + +`StarRocks` 集群地址配置格式 `["fe_ip:fe_http_port", ...]`。 + +### username [string] + +`StarRocks` 用户名称。 + +### password [string] + +`StarRocks` 用户密码。 + +### database [string] + +`StarRocks` 数据库名。 + +### table [string] + +`StarRocks` 表名。 + +### scan_filter [string] + +过滤查询的表达式,该表达式透明地传输到`StarRocks` 。`StarRocks` 使用此表达式完成源端数据过滤。 + +例如 + +``` +"tinyint_1 = 100" +``` + +### schema [config] + +#### fields [Config] + +要生成的`starRocks`的`schema` + +示例 + +``` +schema { + fields { + name = string + age = int + } + } +``` + +### request_tablet_size [int] + +与分区对应的`StarRocks tablet`的数量。此值设置得越小,生成的分区就越多。这将增加引擎的平行度,但同时也会给`StarRocks`造成更大的压力。 + +以下示例,用于解释如何使用`request_tablet_size`来控制分区的生成。 + +``` +StarRocks 集群中表的 tablet 分布作为 follower + +be_node_1 tablet[1, 2, 3, 4, 5] +be_node_2 tablet[6, 7, 8, 9, 10] +be_node_3 tablet[11, 12, 13, 14, 15] + +1.如果没有设置 request_tablet_size,则单个分区中的 tablet 数量将没有限制。分区将按以下方式生成: + +partition[0] 从 be_node_1 读取 tablet 数据:tablet[1, 2, 3, 4, 5] +partition[1] 从 be_node_2 读取 tablet 
数据:tablet[6, 7, 8, 9, 10] +partition[2] 从 be_node_3 读取 tablet 数据:tablet[11, 12, 13, 14, 15] + +2.如果设置了 request_tablet_size=3,则每个分区中最多包含 3 个 tablet。分区将按以下方式生成 + +partition[0] 从 be_node_1 读取 tablet 数据:tablet[1, 2, 3] +partition[1] 从 be_node_1 读取 tablet 数据:tablet[4, 5] +partition[2] 从 be_node_2 读取 tablet 数据:tablet[6, 7, 8] +partition[3] 从 be_node_2 读取 tablet 数据:tablet[9, 10] +partition[4] 从 be_node_3 读取 tablet 数据:tablet[11, 12, 13] +partition[5] 从 be_node_3 读取 tablet 数据:tablet[14,15] +``` + +### scan_connect_timeout_ms [int] + +发送到 `StarRocks` 的请求连接超时。 + +### scan_query_timeout_sec [int] + +在 `StarRocks` 中,查询超时时间的默认值为 1 小时,-1 表示没有超时限制。 + +### scan_keep_alive_min [int] + +查询任务的保持连接时长,单位是分钟,默认值为 10 分钟。我们建议将此参数设置为大于或等于 5 的值。 +### scan_batch_rows [int] + +一次从 `BE` 节点读取的最大数据行数。增加此值可以减少引擎与 `StarRocks` 之间建立的连接数量,从而减轻由网络延迟引起的开销。 +### scan_mem_limit [long] + +单个查询在 BE 节点上允许的最大内存空间,单位为字节,默认值为 2147483648 字节(即 2 GB)。 + +### max_retries [int] + +发送到 `StarRocks` 的重试请求次数。 + +### scan.params. [string] + +从 `BE` 节点扫描数据相关的参数。 + +## 示例 + +``` +source { + StarRocks { + nodeUrls = ["starrocks_e2e:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_source" + scan_batch_rows = 10 + max_retries = 3 + schema { + fields { + BIGINT_COL = BIGINT + LARGEINT_COL = STRING + SMALLINT_COL = SMALLINT + TINYINT_COL = TINYINT + BOOLEAN_COL = BOOLEAN + DECIMAL_COL = "DECIMAL(20, 1)" + DOUBLE_COL = DOUBLE + FLOAT_COL = FLOAT + INT_COL = INT + CHAR_COL = STRING + VARCHAR_11_COL = STRING + STRING_COL = STRING + DATETIME_COL = TIMESTAMP + DATE_COL = DATE + } + } + scan.params.scanner_thread_pool_thread_num = "3" + + } +} +``` + +## Changelog + +### next version + +- Add StarRocks Source Connector + diff --git a/docs/zh/transform-v2/llm.md b/docs/zh/transform-v2/llm.md index 7b505bde243..c1d05d59a34 100644 --- a/docs/zh/transform-v2/llm.md +++ b/docs/zh/transform-v2/llm.md @@ -26,7 +26,7 @@ ### model_provider 要使用的模型提供者。可用选项为: -OPENAI、DOUBAO、KIMIAI、MICROSOFT, CUSTOM +OPENAI,DOUBAO,DEEPSEEK,KIMIAI,MICROSOFT, CUSTOM > tips: 如果使用 Microsoft, 请确保 api_path 配置不能为空 diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/ConfigShade.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/ConfigShade.java index 5532f48e064..d7a8a2f3aa8 100644 --- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/ConfigShade.java +++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/ConfigShade.java @@ -17,6 +17,8 @@ package org.apache.seatunnel.api.configuration; +import java.util.Map; + /** * The interface that provides the ability to encrypt and decrypt {@link * org.apache.seatunnel.shade.com.typesafe.config.Config} @@ -47,4 +49,15 @@ public interface ConfigShade { default String[] sensitiveOptions() { return new String[0]; } + + /** + * this method will be called before the encrypt/decrpyt method. 
Users can use the props to + * control the behavior of the encrypt/decrypt + * + * @param props the additional properties defined with the key `shade.props` in the + * configuration + */ + default void open(Map props) { + // default do nothing + } } diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/ArrayType.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/ArrayType.java index 36c3362108a..65f7651e790 100644 --- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/ArrayType.java +++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/ArrayType.java @@ -56,7 +56,7 @@ public class ArrayType implements SeaTunnelDataType { private final Class arrayClass; private final SeaTunnelDataType elementType; - protected ArrayType(Class arrayClass, SeaTunnelDataType elementType) { + public ArrayType(Class arrayClass, SeaTunnelDataType elementType) { this.arrayClass = arrayClass; this.elementType = elementType; } diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRow.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRow.java index 39f61aee5d0..091284b7e3a 100644 --- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRow.java +++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRow.java @@ -169,7 +169,6 @@ private int getBytesForValue(Object v, SeaTunnelDataType dataType) { if (elementType instanceof DecimalType) { return ((Object[]) v).length * 36; } - if (elementType instanceof LocalTimeType) { SqlType eleSqlType = elementType.getSqlType(); switch (eleSqlType) { @@ -232,6 +231,8 @@ private int getBytesForArray(Object v, SeaTunnelDataType dataType) { return getArrayNotNullSize((Long[]) v) * 8; case DOUBLE: return getArrayNotNullSize((Double[]) v) * 8; + case MAP: + return getArrayMapNotNullSize(v); case NULL: default: return 0; @@ -248,6 +249,19 @@ private int getArrayNotNullSize(Object[] values) { return c; } + private int getArrayMapNotNullSize(Object v) { + int size = 0; + if (Objects.nonNull(v)) { + for (Map o : (Map[]) v) { + for (Map.Entry entry : ((Map) o).entrySet()) { + size += getBytesForValue(entry.getKey()) + getBytesForValue(entry.getValue()); + } + } + } + + return size; + } + public int getBytesSize() { if (size == 0) { int s = 0; @@ -305,6 +319,9 @@ private int getBytesForValue(Object v) { return getBytesForArray(v, BasicType.FLOAT_TYPE); case "Double[]": return getBytesForArray(v, BasicType.DOUBLE_TYPE); + case "Map[]": + return getBytesForArray( + v, new MapType<>(BasicType.STRING_TYPE, BasicType.INT_TYPE)); case "HashMap": case "LinkedHashMap": int size = 0; diff --git a/seatunnel-api/src/test/java/org/apache/seatunnel/api/table/type/SeaTunnelRowTest.java b/seatunnel-api/src/test/java/org/apache/seatunnel/api/table/type/SeaTunnelRowTest.java index 448185b8cb2..ba1745c5ffd 100644 --- a/seatunnel-api/src/test/java/org/apache/seatunnel/api/table/type/SeaTunnelRowTest.java +++ b/seatunnel-api/src/test/java/org/apache/seatunnel/api/table/type/SeaTunnelRowTest.java @@ -17,6 +17,8 @@ package org.apache.seatunnel.api.table.type; +import org.apache.seatunnel.shade.com.google.common.collect.Maps; + import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -43,6 +45,9 @@ void testForRowSize() { new Object[] { 1, "test", 1L, new BigDecimal("3333.333"), })); + + Map objectMap = Maps.newHashMap(); + objectMap.put("name", "cosmos"); SeaTunnelRow row = new SeaTunnelRow( new Object[] { @@ -58,7 
+63,8 @@ void testForRowSize() { new Float[] {1F, 2F}, new Boolean[] {Boolean.TRUE, Boolean.FALSE}, new Byte[] {1, 2, 3, 4}, - new Short[] {Short.parseShort("1")} + new Short[] {Short.parseShort("1")}, + new Map[] {objectMap} }); SeaTunnelRow row2 = @@ -76,14 +82,15 @@ void testForRowSize() { new Float[] {1F, 2F, null}, new Boolean[] {Boolean.TRUE, Boolean.FALSE, null}, new Byte[] {1, 2, 3, 4, null}, - new Short[] {Short.parseShort("1"), null} + new Short[] {Short.parseShort("1"), null}, + new Map[] {objectMap} }); SeaTunnelRowType rowType = new SeaTunnelRowType( new String[] { "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", - "f11", "f12" + "f11", "f12", "f13" }, new SeaTunnelDataType[] { BasicType.INT_TYPE, @@ -107,14 +114,17 @@ void testForRowSize() { ArrayType.FLOAT_ARRAY_TYPE, ArrayType.BOOLEAN_ARRAY_TYPE, ArrayType.BYTE_ARRAY_TYPE, - ArrayType.SHORT_ARRAY_TYPE + ArrayType.SHORT_ARRAY_TYPE, + new ArrayType<>( + Map[].class, + new MapType<>(BasicType.STRING_TYPE, BasicType.STRING_TYPE)) }); - Assertions.assertEquals(249, row.getBytesSize(rowType)); - Assertions.assertEquals(249, row.getBytesSize()); + Assertions.assertEquals(259, row.getBytesSize(rowType)); + Assertions.assertEquals(259, row.getBytesSize()); - Assertions.assertEquals(249, row2.getBytesSize(rowType)); - Assertions.assertEquals(249, row2.getBytesSize()); + Assertions.assertEquals(259, row2.getBytesSize(rowType)); + Assertions.assertEquals(259, row2.getBytesSize()); } @Test diff --git a/seatunnel-common/src/main/java/org/apache/seatunnel/common/config/TypesafeConfigUtils.java b/seatunnel-common/src/main/java/org/apache/seatunnel/common/config/TypesafeConfigUtils.java index d80273ece0e..6f001a3da6d 100644 --- a/seatunnel-common/src/main/java/org/apache/seatunnel/common/config/TypesafeConfigUtils.java +++ b/seatunnel-common/src/main/java/org/apache/seatunnel/common/config/TypesafeConfigUtils.java @@ -77,6 +77,9 @@ public static T getConfig( ? (T) Boolean.valueOf(config.getString(configKey)) : defaultValue; } + if (defaultValue instanceof Map) { + return config.hasPath(configKey) ? 
(T) config.getAnyRef(configKey) : defaultValue; + } throw new RuntimeException("Unsupported config type, configKey: " + configKey); } diff --git a/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/JsonUtils.java b/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/JsonUtils.java index 0ca64515044..5d765583ba8 100644 --- a/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/JsonUtils.java +++ b/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/JsonUtils.java @@ -303,4 +303,13 @@ public String deserialize(JsonParser p, DeserializationContext ctxt) throws IOEx } } } + + public static boolean isJsonArray(String jsonString) { + try { + JsonNode jsonNode = OBJECT_MAPPER.readTree(jsonString); + return jsonNode.isArray(); + } catch (Exception e) { + return false; + } + } } diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java index 89b9c50c30d..227d2b7eeef 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java @@ -17,6 +17,9 @@ package org.apache.seatunnel.connectors.cdc.debezium.row; +import org.apache.seatunnel.shade.com.google.common.annotations.VisibleForTesting; + +import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; @@ -48,6 +51,7 @@ import java.time.LocalTime; import java.time.ZoneId; import java.util.Arrays; +import java.util.List; import java.util.Optional; /** Deserialization schema from Debezium object to {@link SeaTunnelRow} */ @@ -173,12 +177,49 @@ public Object convert(Object dbzObj, Schema schema) throws Exception { return createRowConverter( (SeaTunnelRowType) type, serverTimeZone, userDefinedConverterFactory); case ARRAY: + return createArrayConverter(type); case MAP: default: throw new UnsupportedOperationException("Unsupported type: " + type); } } + @VisibleForTesting + protected static DebeziumDeserializationConverter createArrayConverter( + SeaTunnelDataType type) { + SeaTunnelDataType elementType = ((ArrayType) type).getElementType(); + switch (elementType.getSqlType()) { + case BOOLEAN: + return (dbzObj, schema) -> + convertListToArray((List) dbzObj, Boolean.class); + case SMALLINT: + return (dbzObj, schema) -> convertListToArray((List) dbzObj, Short.class); + case INT: + return (dbzObj, schema) -> + convertListToArray((List) dbzObj, Integer.class); + case BIGINT: + return (dbzObj, schema) -> convertListToArray((List) dbzObj, Long.class); + case FLOAT: + return (dbzObj, schema) -> convertListToArray((List) dbzObj, Float.class); + case DOUBLE: + return (dbzObj, schema) -> convertListToArray((List) dbzObj, Double.class); + case STRING: + return (dbzObj, schema) -> convertListToArray((List) dbzObj, String.class); + default: + throw new IllegalArgumentException( + "Unsupported SQL type: " + elementType.getSqlType()); + } + } + + 
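    // Editorial sketch (not part of the original patch): how the ARRAY branch above is expected
    // to behave. Debezium hands array columns over as java.util.List values, and the converter
    // returned here turns them into the boxed Java arrays that SeaTunnelRow stores, e.g.
    //
    //   DebeziumDeserializationConverter c = createArrayConverter(ArrayType.INT_ARRAY_TYPE);
    //   Integer[] values = (Integer[]) c.convert(Arrays.asList(1, 2, 3), schema);
    //
    // Element types outside the switch (DECIMAL, DATE, BYTES, ...) still fail fast with an
    // IllegalArgumentException instead of producing a mistyped array.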
@SuppressWarnings("unchecked") + private static T[] convertListToArray(List list, Class clazz) { + T[] array = (T[]) java.lang.reflect.Array.newInstance(clazz, list.size()); + for (int i = 0; i < list.size(); i++) { + array[i] = list.get(i); + } + return array; + } + private static DebeziumDeserializationConverter convertToBoolean() { return new DebeziumDeserializationConverter() { private static final long serialVersionUID = 1L; diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/test/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConvertersTest.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/test/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConvertersTest.java index 74e832d6e0f..14098cecc91 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/test/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConvertersTest.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/test/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConvertersTest.java @@ -17,10 +17,12 @@ package org.apache.seatunnel.connectors.cdc.debezium.row; +import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.connectors.cdc.debezium.DebeziumDeserializationConverter; import org.apache.seatunnel.connectors.cdc.debezium.DebeziumDeserializationConverterFactory; import org.apache.seatunnel.connectors.cdc.debezium.MetadataConverter; @@ -34,6 +36,7 @@ import java.time.ZoneId; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; public class SeaTunnelRowDebeziumDeserializationConvertersTest { @@ -75,4 +78,53 @@ void testDefaultValueNotUsed() throws Exception { Assertions.assertEquals(row.getField(0), 1); Assertions.assertNull(row.getField(1)); } + + @Test + void testArrayConverter() throws Exception { + DebeziumDeserializationConverter converter; + // bool array converter + converter = + SeaTunnelRowDebeziumDeserializationConverters.createArrayConverter( + ArrayType.BOOLEAN_ARRAY_TYPE); + Boolean[] booleans = new Boolean[] {false, true}; + Assertions.assertTrue( + Arrays.equals( + booleans, (Boolean[]) (converter.convert(Arrays.asList(booleans), null)))); + // smallInt array converter + converter = + SeaTunnelRowDebeziumDeserializationConverters.createArrayConverter( + ArrayType.SHORT_ARRAY_TYPE); + Short[] shorts = new Short[] {(short) 1, (short) 2}; + Assertions.assertTrue( + Arrays.equals(shorts, (Short[]) (converter.convert(Arrays.asList(shorts), null)))); + // int array converter + converter = + SeaTunnelRowDebeziumDeserializationConverters.createArrayConverter( + ArrayType.INT_ARRAY_TYPE); + Integer[] ints = new Integer[] {1, 2}; + Assertions.assertTrue( + Arrays.equals(ints, (Integer[]) (converter.convert(Arrays.asList(ints), null)))); + // long array converter + converter = + SeaTunnelRowDebeziumDeserializationConverters.createArrayConverter( + ArrayType.LONG_ARRAY_TYPE); + Long[] longs = new Long[] {1L, 2L}; + Assertions.assertTrue( + Arrays.equals(longs, (Long[]) (converter.convert(Arrays.asList(longs), null)))); + // float array converter + converter = + 
SeaTunnelRowDebeziumDeserializationConverters.createArrayConverter( + ArrayType.FLOAT_ARRAY_TYPE); + Float[] floats = new Float[] {1.0f, 2.0f}; + Assertions.assertTrue( + Arrays.equals(floats, (Float[]) (converter.convert(Arrays.asList(floats), null)))); + // double array converter + converter = + SeaTunnelRowDebeziumDeserializationConverters.createArrayConverter( + ArrayType.DOUBLE_ARRAY_TYPE); + Double[] doubles = new Double[] {1.0, 2.0}; + Assertions.assertTrue( + Arrays.equals( + doubles, (Double[]) (converter.convert(Arrays.asList(doubles), null)))); + } } diff --git a/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/catalog/ElasticSearchTypeConverter.java b/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/catalog/ElasticSearchTypeConverter.java index 412342cb828..d92d0839988 100644 --- a/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/catalog/ElasticSearchTypeConverter.java +++ b/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/catalog/ElasticSearchTypeConverter.java @@ -64,7 +64,6 @@ import static org.apache.seatunnel.connectors.seatunnel.elasticsearch.client.EsType.LONG; import static org.apache.seatunnel.connectors.seatunnel.elasticsearch.client.EsType.LONG_RANGE; import static org.apache.seatunnel.connectors.seatunnel.elasticsearch.client.EsType.MATCH_ONLY_TEXT; -import static org.apache.seatunnel.connectors.seatunnel.elasticsearch.client.EsType.NESTED; import static org.apache.seatunnel.connectors.seatunnel.elasticsearch.client.EsType.OBJECT; import static org.apache.seatunnel.connectors.seatunnel.elasticsearch.client.EsType.PERCOLATOR; import static org.apache.seatunnel.connectors.seatunnel.elasticsearch.client.EsType.POINT; @@ -150,6 +149,12 @@ public Column convert(BasicTypeDefine typeDefine) { }); builder.dataType(rowType); break; + case EsType.NESTED: + builder.dataType( + new ArrayType<>( + Map[].class, + new MapType<>(BasicType.STRING_TYPE, BasicType.STRING_TYPE))); + break; case INTEGER: case TOKEN_COUNT: builder.dataType(BasicType.INT_TYPE); @@ -207,7 +212,6 @@ public Column convert(BasicTypeDefine typeDefine) { case COMPLETION: case STRING: case GEO_SHAPE: - case NESTED: case PERCOLATOR: case POINT: case RANK_FEATURES: diff --git a/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/serialize/source/DefaultSeaTunnelRowDeserializer.java b/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/serialize/source/DefaultSeaTunnelRowDeserializer.java index fd176f2f034..e1d8ca1da45 100644 --- a/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/serialize/source/DefaultSeaTunnelRowDeserializer.java +++ b/seatunnel-connectors-v2/connector-elasticsearch/src/main/java/org/apache/seatunnel/connectors/seatunnel/elasticsearch/serialize/source/DefaultSeaTunnelRowDeserializer.java @@ -42,10 +42,13 @@ import java.time.LocalDateTime; import java.time.ZoneId; import java.time.format.DateTimeFormatter; +import java.util.ArrayList; import java.util.Base64; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static 
org.apache.seatunnel.api.table.type.BasicType.BOOLEAN_TYPE; import static org.apache.seatunnel.api.table.type.BasicType.BYTE_TYPE; @@ -177,7 +180,17 @@ Object convertValue(SeaTunnelDataType fieldType, String fieldValue) } else if (fieldType instanceof ArrayType) { ArrayType arrayType = (ArrayType) fieldType; SeaTunnelDataType elementType = arrayType.getElementType(); - List stringList = JsonUtils.toList(fieldValue, String.class); + List stringList = new ArrayList<>(); + if (elementType instanceof MapType) { + stringList = + JsonUtils.isJsonArray(fieldValue) + ? JsonUtils.toList(fieldValue, Map.class).stream() + .map(JsonUtils::toJsonString) + .collect(Collectors.toList()) + : Collections.singletonList(fieldValue); + } else { + stringList = JsonUtils.toList(fieldValue, String.class); + } Object arr = Array.newInstance(elementType.getTypeClass(), stringList.size()); for (int i = 0; i < stringList.size(); i++) { Object convertValue = convertValue(elementType, stringList.get(i)); diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseFileSinkConfig.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseFileSinkConfig.java index 2957f451b4d..bebf4fdbae0 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseFileSinkConfig.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseFileSinkConfig.java @@ -44,6 +44,8 @@ public class BaseFileSinkConfig implements DelimiterConfig, Serializable { protected String path; protected String fileNameExpression = BaseSinkConfig.FILE_NAME_EXPRESSION.defaultValue(); protected boolean singleFileMode = BaseSinkConfig.SINGLE_FILE_MODE.defaultValue(); + protected boolean createEmptyFileWhenNoData = + BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA.defaultValue(); protected FileFormat fileFormat = FileFormat.TEXT; protected DateUtils.Formatter dateFormat = DateUtils.Formatter.YYYY_MM_DD; protected DateTimeUtils.Formatter datetimeFormat = DateTimeUtils.Formatter.YYYY_MM_DD_HH_MM_SS; @@ -87,6 +89,11 @@ public BaseFileSinkConfig(@NonNull Config config) { this.singleFileMode = config.getBoolean(BaseSinkConfig.SINGLE_FILE_MODE.key()); } + if (config.hasPath(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA.key())) { + this.createEmptyFileWhenNoData = + config.getBoolean(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA.key()); + } + if (config.hasPath(BaseSinkConfig.FILE_FORMAT_TYPE.key()) && !StringUtils.isBlank(config.getString(BaseSinkConfig.FILE_FORMAT_TYPE.key()))) { this.fileFormat = diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSinkConfig.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSinkConfig.java index 88be37bc106..d2d3c4d0cd9 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSinkConfig.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSinkConfig.java @@ -194,6 +194,13 @@ public class BaseSinkConfig extends KerberosConfig { .withDescription( "Whether to write all 
data to a single file in each parallelism task"); + public static final Option CREATE_EMPTY_FILE_WHEN_NO_DATA = + Options.key("create_empty_file_when_no_data") + .booleanType() + .defaultValue(false) + .withDescription( + "Whether to generate an empty file when there is no data to write"); + public static final Option FILENAME_TIME_FORMAT = Options.key("filename_time_format") .stringType() diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/BaseFileSink.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/BaseFileSink.java index a02e41c87b1..09bc3ea2857 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/BaseFileSink.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/BaseFileSink.java @@ -60,6 +60,12 @@ public void preCheckConfig() { throw new IllegalArgumentException( "Single file mode is not supported when checkpoint is enabled or in streaming mode."); } + if (pluginConfig.hasPath(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA.key()) + && pluginConfig.getBoolean(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA.key()) + && !fileSinkConfig.getPartitionFieldList().isEmpty()) { + throw new IllegalArgumentException( + "Generate empty file when no data is not supported when partition is enabled."); + } } @Override diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/AbstractWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/AbstractWriteStrategy.java index e48f8d3729d..3ae380d3079 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/AbstractWriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/AbstractWriteStrategy.java @@ -44,6 +44,7 @@ import org.slf4j.LoggerFactory; import lombok.NonNull; +import lombok.SneakyThrows; import java.io.File; import java.io.IOException; @@ -59,7 +60,7 @@ import java.util.regex.Matcher; import java.util.stream.Collectors; -public abstract class AbstractWriteStrategy implements WriteStrategy { +public abstract class AbstractWriteStrategy implements WriteStrategy { protected final Logger log = LoggerFactory.getLogger(this.getClass()); protected final FileSinkConfig fileSinkConfig; protected final CompressFormat compressFormat; @@ -248,8 +249,13 @@ public final String generateFileName(String transactionId) { * * @return the file commit information */ + @SneakyThrows @Override public Optional prepareCommit() { + if (this.needMoveFiles.isEmpty() && fileSinkConfig.isCreateEmptyFileWhenNoData()) { + String filePath = createFilePathWithoutPartition(); + this.getOrCreateOutputStream(filePath); + } this.finishAndCloseFile(); LinkedHashMap commitMap = new LinkedHashMap<>(this.needMoveFiles); LinkedHashMap> copyMap = @@ -361,10 +367,25 @@ public static String getTransactionDirPrefix(String tmpPath, String jobId, Strin return String.join(File.separator, strings); } + public String createFilePathWithoutPartition() { + return getPathWithPartitionInfo(null, true); + } + 
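    // Editorial note (not part of the original patch): passing a null map with noPartition = true
    // is safe because getPathWithPartitionInfo() only reads dataPartitionDirAndValuesMap when
    // noPartition is false; otherwise it keys the file on BaseSinkConfig.NON_PARTITION. This is
    // what lets prepareCommit() above open an output stream for an empty file even though no row
    // (and therefore no partition value) was ever seen, and BaseFileSink#preCheckConfig rejects
    // the option when partition keys are configured, so the non-partitioned path is the only one
    // that has to be handled here. Pulling getOrCreateOutputStream(String) up into the
    // WriteStrategy interface is what makes that call format-agnostic.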
public String getOrCreateFilePathBeingWritten(@NonNull SeaTunnelRow seaTunnelRow) { LinkedHashMap> dataPartitionDirAndValuesMap = generatorPartitionDir(seaTunnelRow); - String beingWrittenFileKey = dataPartitionDirAndValuesMap.keySet().toArray()[0].toString(); + boolean noPartition = + BaseSinkConfig.NON_PARTITION.equals( + dataPartitionDirAndValuesMap.keySet().toArray()[0].toString()); + return getPathWithPartitionInfo(dataPartitionDirAndValuesMap, noPartition); + } + + private String getPathWithPartitionInfo( + LinkedHashMap> dataPartitionDirAndValuesMap, boolean noPartition) { + String beingWrittenFileKey = + noPartition + ? BaseSinkConfig.NON_PARTITION + : dataPartitionDirAndValuesMap.keySet().toArray()[0].toString(); // get filePath from beingWrittenFile String beingWrittenFilePath = beingWrittenFile.get(beingWrittenFileKey); if (beingWrittenFilePath != null) { @@ -376,8 +397,7 @@ transactionDirectory, beingWrittenFileKey, generateFileName(transactionId) }; String newBeingWrittenFilePath = String.join(File.separator, pathSegments); beingWrittenFile.put(beingWrittenFileKey, newBeingWrittenFilePath); - if (!BaseSinkConfig.NON_PARTITION.equals( - dataPartitionDirAndValuesMap.keySet().toArray()[0].toString())) { + if (!noPartition) { partitionDirAndValuesMap.putAll(dataPartitionDirAndValuesMap); } return newBeingWrittenFilePath; diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/BinaryWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/BinaryWriteStrategy.java index 06d05d62505..db3f0c1fc25 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/BinaryWriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/BinaryWriteStrategy.java @@ -34,7 +34,7 @@ import java.io.IOException; import java.util.LinkedHashMap; -public class BinaryWriteStrategy extends AbstractWriteStrategy { +public class BinaryWriteStrategy extends AbstractWriteStrategy { private final LinkedHashMap beingWrittenOutputStream; private final LinkedHashMap partIndexMap; @@ -43,6 +43,11 @@ public BinaryWriteStrategy(FileSinkConfig fileSinkConfig) { super(fileSinkConfig); this.beingWrittenOutputStream = new LinkedHashMap<>(); this.partIndexMap = new LinkedHashMap<>(); + if (fileSinkConfig.isCreateEmptyFileWhenNoData()) { + throw new FileConnectorException( + FileConnectorErrorCode.FORMAT_NOT_SUPPORT, + "BinaryWriteStrategy does not support generating empty files when no data is written."); + } } @Override @@ -88,7 +93,8 @@ public String getOrCreateFilePathBeingWritten(String relativePath) { } } - private FSDataOutputStream getOrCreateOutputStream(@NonNull String filePath) { + @Override + public FSDataOutputStream getOrCreateOutputStream(@NonNull String filePath) { FSDataOutputStream fsDataOutputStream = beingWrittenOutputStream.get(filePath); if (fsDataOutputStream == null) { try { diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ExcelWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ExcelWriteStrategy.java index 1e4f90aadd3..0fa1d260b4d 100644 --- 
a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ExcelWriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ExcelWriteStrategy.java @@ -29,7 +29,7 @@ import java.io.IOException; import java.util.LinkedHashMap; -public class ExcelWriteStrategy extends AbstractWriteStrategy { +public class ExcelWriteStrategy extends AbstractWriteStrategy { private final LinkedHashMap beingWrittenWriter; public ExcelWriteStrategy(FileSinkConfig fileSinkConfig) { @@ -41,7 +41,7 @@ public ExcelWriteStrategy(FileSinkConfig fileSinkConfig) { public void write(SeaTunnelRow seaTunnelRow) { super.write(seaTunnelRow); String filePath = getOrCreateFilePathBeingWritten(seaTunnelRow); - ExcelGenerator excelGenerator = getOrCreateExcelGenerator(filePath); + ExcelGenerator excelGenerator = getOrCreateOutputStream(filePath); excelGenerator.writeData(seaTunnelRow); } @@ -63,7 +63,8 @@ public void finishAndCloseFile() { beingWrittenWriter.clear(); } - private ExcelGenerator getOrCreateExcelGenerator(@NonNull String filePath) { + @Override + public ExcelGenerator getOrCreateOutputStream(@NonNull String filePath) { ExcelGenerator excelGenerator = this.beingWrittenWriter.get(filePath); if (excelGenerator == null) { excelGenerator = diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/JsonWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/JsonWriteStrategy.java index 23fb7893a8f..bc4a08b2d84 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/JsonWriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/JsonWriteStrategy.java @@ -39,7 +39,7 @@ import java.util.LinkedHashMap; import java.util.Map; -public class JsonWriteStrategy extends AbstractWriteStrategy { +public class JsonWriteStrategy extends AbstractWriteStrategy { private final byte[] rowDelimiter; private SerializationSchema serializationSchema; private final LinkedHashMap beingWrittenOutputStream; @@ -111,7 +111,8 @@ public void finishAndCloseFile() { isFirstWrite.clear(); } - private FSDataOutputStream getOrCreateOutputStream(@NonNull String filePath) { + @Override + public FSDataOutputStream getOrCreateOutputStream(@NonNull String filePath) { FSDataOutputStream fsDataOutputStream = beingWrittenOutputStream.get(filePath); if (fsDataOutputStream == null) { try { diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/OrcWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/OrcWriteStrategy.java index f6b47ce4d24..366c9bb82a8 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/OrcWriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/OrcWriteStrategy.java @@ -59,7 +59,7 @@ import java.util.List; import java.util.Map; -public class 
OrcWriteStrategy extends AbstractWriteStrategy { +public class OrcWriteStrategy extends AbstractWriteStrategy { private final LinkedHashMap beingWrittenWriter; public OrcWriteStrategy(FileSinkConfig fileSinkConfig) { @@ -71,7 +71,7 @@ public OrcWriteStrategy(FileSinkConfig fileSinkConfig) { public void write(@NonNull SeaTunnelRow seaTunnelRow) { super.write(seaTunnelRow); String filePath = getOrCreateFilePathBeingWritten(seaTunnelRow); - Writer writer = getOrCreateWriter(filePath); + Writer writer = getOrCreateOutputStream(filePath); TypeDescription schema = buildSchemaWithRowType(); VectorizedRowBatch rowBatch = schema.createRowBatch(); int i = 0; @@ -109,7 +109,8 @@ public void finishAndCloseFile() { this.beingWrittenWriter.clear(); } - private Writer getOrCreateWriter(@NonNull String filePath) { + @Override + public Writer getOrCreateOutputStream(@NonNull String filePath) { Writer writer = this.beingWrittenWriter.get(filePath); if (writer == null) { TypeDescription schema = buildSchemaWithRowType(); diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ParquetWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ParquetWriteStrategy.java index b0f873296cd..41155fd8291 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ParquetWriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/ParquetWriteStrategy.java @@ -74,7 +74,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -public class ParquetWriteStrategy extends AbstractWriteStrategy { +public class ParquetWriteStrategy extends AbstractWriteStrategy> { private final LinkedHashMap> beingWrittenWriter; private AvroSchemaConverter schemaConverter; private Schema schema; @@ -119,7 +119,7 @@ public void init(HadoopConf conf, String jobId, String uuidPrefix, int subTaskIn public void write(@NonNull SeaTunnelRow seaTunnelRow) { super.write(seaTunnelRow); String filePath = getOrCreateFilePathBeingWritten(seaTunnelRow); - ParquetWriter writer = getOrCreateWriter(filePath); + ParquetWriter writer = getOrCreateOutputStream(filePath); GenericRecordBuilder recordBuilder = new GenericRecordBuilder(schema); for (Integer integer : sinkColumnsIndexInRow) { String fieldName = seaTunnelRowType.getFieldName(integer); @@ -155,7 +155,8 @@ public void finishAndCloseFile() { this.beingWrittenWriter.clear(); } - private ParquetWriter getOrCreateWriter(@NonNull String filePath) { + @Override + public ParquetWriter getOrCreateOutputStream(@NonNull String filePath) { if (schema == null) { schema = buildAvroSchemaWithRowType(seaTunnelRowType, sinkColumnsIndexInRow); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/TextWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/TextWriteStrategy.java index 77e2eb5c5b0..262448d1954 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/TextWriteStrategy.java +++ 
b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/TextWriteStrategy.java @@ -43,7 +43,7 @@ import java.util.LinkedHashMap; import java.util.Map; -public class TextWriteStrategy extends AbstractWriteStrategy { +public class TextWriteStrategy extends AbstractWriteStrategy { private final LinkedHashMap beingWrittenOutputStream; private final Map isFirstWrite; private final String fieldDelimiter; @@ -132,7 +132,8 @@ public void finishAndCloseFile() { isFirstWrite.clear(); } - private FSDataOutputStream getOrCreateOutputStream(@NonNull String filePath) { + @Override + public FSDataOutputStream getOrCreateOutputStream(@NonNull String filePath) { FSDataOutputStream fsDataOutputStream = beingWrittenOutputStream.get(filePath); if (fsDataOutputStream == null) { try { diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/WriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/WriteStrategy.java index 24b23c9bfc3..25b84714d41 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/WriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/WriteStrategy.java @@ -27,11 +27,12 @@ import org.apache.hadoop.conf.Configuration; import java.io.Closeable; +import java.io.IOException; import java.io.Serializable; import java.util.LinkedHashMap; import java.util.List; -public interface WriteStrategy extends Transaction, Serializable, Closeable { +public interface WriteStrategy extends Transaction, Serializable, Closeable { /** * init hadoop conf * @@ -70,6 +71,8 @@ public interface WriteStrategy extends Transaction, Serializable, Closeable { */ LinkedHashMap> generatorPartitionDir(SeaTunnelRow seaTunnelRow); + T getOrCreateOutputStream(String path) throws IOException; + /** * use transaction id generate file name * diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/XmlWriteStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/XmlWriteStrategy.java index 74fa220031d..adcc57b3dcc 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/XmlWriteStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/writer/XmlWriteStrategy.java @@ -35,7 +35,7 @@ * ensures that each file is written to only once. It writes the data by passing the data row to the * corresponding XmlWriter instance. 
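 * Editorial note (not part of the original patch): with WriteStrategy now parameterized by the
 * writer type, this class exposes its per-file XmlWriter through the shared
 * getOrCreateOutputStream(String) hook instead of the former private getOrCreateXmlWriter, so the
 * base class can force a file to exist (for example for the create_empty_file_when_no_data case)
 * without knowing the concrete output format.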
*/ -public class XmlWriteStrategy extends AbstractWriteStrategy { +public class XmlWriteStrategy extends AbstractWriteStrategy { private final LinkedHashMap beingWrittenWriter; @@ -48,7 +48,7 @@ public XmlWriteStrategy(FileSinkConfig fileSinkConfig) { public void write(SeaTunnelRow seaTunnelRow) throws FileConnectorException { super.write(seaTunnelRow); String filePath = getOrCreateFilePathBeingWritten(seaTunnelRow); - XmlWriter xmlDocWriter = getOrCreateXmlWriter(filePath); + XmlWriter xmlDocWriter = getOrCreateOutputStream(filePath); xmlDocWriter.writeData(seaTunnelRow); } @@ -70,7 +70,8 @@ public void finishAndCloseFile() { this.beingWrittenWriter.clear(); } - private XmlWriter getOrCreateXmlWriter(String filePath) { + @Override + public XmlWriter getOrCreateOutputStream(String filePath) { return beingWrittenWriter.computeIfAbsent( filePath, k -> new XmlWriter(fileSinkConfig, sinkColumnsIndexInRow, seaTunnelRowType)); diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/BinaryReadStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/BinaryReadStrategy.java index 7849415b32d..66f1a262834 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/BinaryReadStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/BinaryReadStrategy.java @@ -77,6 +77,7 @@ public void read(String path, String tableId, Collector output) } SeaTunnelRow row = new SeaTunnelRow(new Object[] {buffer, relativePath, partIndex}); buffer = new byte[1024]; + row.setTableId(tableId); output.collect(row); partIndex++; } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java index b728af49514..b7b61854982 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java @@ -93,6 +93,7 @@ public OptionRule optionRule() { .optional(BaseSinkConfig.TIME_FORMAT) .optional(BaseSinkConfig.SINGLE_FILE_MODE) .optional(BaseSinkConfig.BATCH_SIZE) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .build(); } } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/sink/FtpFileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/sink/FtpFileSinkFactory.java index 3dc48bd3bba..0a9dfcb3d76 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/sink/FtpFileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/sink/FtpFileSinkFactory.java @@ -107,6 +107,7 @@ public OptionRule optionRule() { 
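                // Editorial note (not part of the original patch): the same optional
                // BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA entry is added to every file sink
                // factory touched by this change (COS, FTP, HDFS, Jindo OSS, Local, OSS, S3,
                // SFTP), so the new option passes option-rule validation no matter which file
                // system backs the sink.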
.optional(FtpConfigOptions.FTP_CONNECTION_MODE) .optional(BaseSinkConfig.SINGLE_FILE_MODE) .optional(BaseSinkConfig.BATCH_SIZE) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/sink/HdfsFileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/sink/HdfsFileSinkFactory.java index b937a8bed0a..1f5d98b0be1 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/sink/HdfsFileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/sink/HdfsFileSinkFactory.java @@ -95,6 +95,7 @@ public OptionRule optionRule() { .optional(BaseSinkConfig.KERBEROS_KEYTAB_PATH) .optional(BaseSinkConfig.KRB5_PATH) .optional(BaseSinkConfig.REMOTE_USER) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .build(); } } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/jindo/sink/OssFileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/jindo/sink/OssFileSinkFactory.java index 7ecbb6c3f10..2064c2937ea 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/jindo/sink/OssFileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/jindo/sink/OssFileSinkFactory.java @@ -93,6 +93,7 @@ public OptionRule optionRule() { .optional(BaseSinkConfig.TIME_FORMAT) .optional(BaseSinkConfig.SINGLE_FILE_MODE) .optional(BaseSinkConfig.BATCH_SIZE) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .build(); } } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/sink/LocalFileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/sink/LocalFileSinkFactory.java index 8450f139994..c6e75615be7 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/sink/LocalFileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/sink/LocalFileSinkFactory.java @@ -100,6 +100,7 @@ public OptionRule optionRule() { .optional(BaseSinkConfig.TIME_FORMAT) .optional(BaseSinkConfig.SINGLE_FILE_MODE) .optional(BaseSinkConfig.BATCH_SIZE) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-local/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/local/LocalFileTest.java b/seatunnel-connectors-v2/connector-file/connector-file-local/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/local/LocalFileTest.java index 8280ec5e2af..f1fffa4ece0 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-local/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/local/LocalFileTest.java +++ 
b/seatunnel-connectors-v2/connector-file/connector-file-local/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/local/LocalFileTest.java @@ -25,6 +25,7 @@ import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.common.utils.FileUtils; +import org.apache.seatunnel.connectors.seatunnel.file.exception.FileConnectorException; import org.apache.seatunnel.connectors.seatunnel.file.local.sink.LocalFileSinkFactory; import org.apache.seatunnel.connectors.seatunnel.sink.SinkFlowTestUtils; @@ -33,7 +34,9 @@ import org.junit.jupiter.api.condition.DisabledOnOs; import org.junit.jupiter.api.condition.OS; +import java.io.File; import java.io.IOException; +import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -153,4 +156,78 @@ void testSingleFileMode() throws IOException { FileUtils.getFileLineNumber( "/tmp/seatunnel/LocalFileTest/only_one_file_1.txt")); } + + @Test + void testCreateEmptyFileWhenNoData() throws IOException { + Map options = + new HashMap() { + { + put("path", "/tmp/seatunnel/LocalFileTest"); + put("row_delimiter", "\n"); + put("file_name_expression", "empty_file"); + put("is_enable_transaction", false); + put("batch_size", 1); + put("create_empty_file_when_no_data", true); + } + }; + options.put("file_format_type", "text"); + FileUtils.deleteFile("/tmp/seatunnel/LocalFileTest"); + SinkFlowTestUtils.runBatchWithCheckpointDisabled( + catalogTable, + ReadonlyConfig.fromMap(options), + new LocalFileSinkFactory(), + Collections.emptyList()); + Assertions.assertEquals( + 0, + (long) + FileUtils.getFileLineNumber( + "/tmp/seatunnel/LocalFileTest/empty_file_0.txt")); + + options.put("file_format_type", "csv"); + FileUtils.deleteFile("/tmp/seatunnel/LocalFileTest"); + SinkFlowTestUtils.runBatchWithCheckpointDisabled( + catalogTable, + ReadonlyConfig.fromMap(options), + new LocalFileSinkFactory(), + Collections.emptyList()); + Assertions.assertEquals( + 0, + (long) + FileUtils.getFileLineNumber( + "/tmp/seatunnel/LocalFileTest/empty_file_0.csv")); + + options.put("enable_header_write", true); + SinkFlowTestUtils.runBatchWithCheckpointDisabled( + catalogTable, + ReadonlyConfig.fromMap(options), + new LocalFileSinkFactory(), + Collections.emptyList()); + Assertions.assertEquals( + "test\n", + FileUtils.readFileToStr( + Paths.get("/tmp/seatunnel/LocalFileTest/empty_file_0.csv"))); + + options.put("file_format_type", "parquet"); + SinkFlowTestUtils.runBatchWithCheckpointDisabled( + catalogTable, + ReadonlyConfig.fromMap(options), + new LocalFileSinkFactory(), + Collections.emptyList()); + Assertions.assertEquals( + 300, new File("/tmp/seatunnel/LocalFileTest/empty_file_0.parquet").length()); + + options.put("file_format_type", "binary"); + FileConnectorException exception = + Assertions.assertThrows( + FileConnectorException.class, + () -> + SinkFlowTestUtils.runBatchWithCheckpointDisabled( + catalogTable, + ReadonlyConfig.fromMap(options), + new LocalFileSinkFactory(), + Collections.emptyList())); + Assertions.assertEquals( + "ErrorCode:[FILE-07], ErrorDescription:[Format not support] - BinaryWriteStrategy does not support generating empty files when no data is written.", + exception.getMessage()); + } } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/sink/OssFileSinkFactory.java 
b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/sink/OssFileSinkFactory.java index 246c769b76d..82a062faf1e 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/sink/OssFileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/sink/OssFileSinkFactory.java @@ -105,6 +105,7 @@ public OptionRule optionRule() { .optional(BaseSinkConfig.TIME_FORMAT) .optional(BaseSinkConfig.SINGLE_FILE_MODE) .optional(BaseSinkConfig.BATCH_SIZE) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .optional(SinkCommonOptions.MULTI_TABLE_SINK_REPLICA) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/sink/S3FileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/sink/S3FileSinkFactory.java index 950582a860c..492605b874e 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/sink/S3FileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/sink/S3FileSinkFactory.java @@ -105,6 +105,7 @@ public OptionRule optionRule() { .optional(BaseSinkConfig.TIME_FORMAT) .optional(BaseSinkConfig.SINGLE_FILE_MODE) .optional(BaseSinkConfig.BATCH_SIZE) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .optional(BaseSinkConfig.TMP_PATH) .optional(SinkCommonOptions.MULTI_TABLE_SINK_REPLICA) .build(); diff --git a/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/sink/SftpFileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/sink/SftpFileSinkFactory.java index 4ff9c6928fd..4ab3d186517 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/sink/SftpFileSinkFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/sink/SftpFileSinkFactory.java @@ -105,6 +105,7 @@ public OptionRule optionRule() { .optional(BaseSinkConfig.TIME_FORMAT) .optional(BaseSinkConfig.SINGLE_FILE_MODE) .optional(BaseSinkConfig.BATCH_SIZE) + .optional(BaseSinkConfig.CREATE_EMPTY_FILE_WHEN_NO_DATA) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/system/SFTPFileSystem.java b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/system/SFTPFileSystem.java index 83fccdeb3c4..99bf4177639 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/system/SFTPFileSystem.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/system/SFTPFileSystem.java @@ -40,6 +40,7 @@ import java.io.OutputStream; import java.net.URI; import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; import 
java.util.ArrayList; import java.util.Vector; @@ -154,6 +155,29 @@ private boolean exists(ChannelSftp channel, Path file) throws IOException { } } + public String quote(String path) { + byte[] _path = path.getBytes(StandardCharsets.UTF_8); + int count = 0; + for (int i = 0; i < _path.length; i++) { + byte b = _path[i]; + if (b == '\\' || b == '?' || b == '*') { + count++; + } + } + if (count == 0) { + return path; + } + byte[] _path2 = new byte[_path.length + count]; + for (int i = 0, j = 0; i < _path.length; i++) { + byte b = _path[i]; + if (b == '\\' || b == '?' || b == '*') { + _path2[j++] = '\\'; + } + _path2[j++] = b; + } + return new String(_path2, 0, _path2.length, StandardCharsets.UTF_8); + } + /** * Convenience method, so that we don't open a new connection when using this method from within * another method. Otherwise every API invocation incurs the overhead of opening/closing a TCP @@ -466,7 +490,7 @@ public FSDataInputStream open(Path f, int bufferSize) throws IOException { // the path could be a symbolic link, so get the real path absolute = new Path("/", channel.realpath(absolute.toUri().getPath())); - is = channel.get(absolute.toUri().getPath()); + is = channel.get(quote(absolute.toUri().getPath())); } catch (SftpException e) { throw new IOException(e); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/system/SftpFileSystemTest.java b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/system/SftpFileSystemTest.java new file mode 100644 index 00000000000..0e539350b02 --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/system/SftpFileSystemTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.file.sftp.system; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class SftpFileSystemTest { + + @Test + void convertAllTypeFileName() { + SFTPFileSystem sftpFileSystem = new SFTPFileSystem(); + Assertions.assertEquals( + "/home/seatunnel/tmp/seatunnel/read/wildcard/e2e.txt", + sftpFileSystem.quote("/home/seatunnel/tmp/seatunnel/read/wildcard/e2e.txt")); + // test file name with wildcard '*' + Assertions.assertEquals( + "/home/seatunnel/tmp/seatunnel/read/wildcard/e\\*e.txt", + sftpFileSystem.quote("/home/seatunnel/tmp/seatunnel/read/wildcard/e*e.txt")); + + // test file name with wildcard '?' 
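        // Editorial note (not part of the original patch): JSch's ChannelSftp expands '*' and '?'
        // as glob wildcards in get(), so a literal file name containing them would otherwise match
        // the wrong remote paths (or nothing at all). quote() escapes each '\', '?' and '*' with a
        // backslash before the path is handed to channel.get(), which is why the expected values
        // in these assertions contain the extra '\\'.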
+ Assertions.assertEquals( + "/home/seatunnel/tmp/seatunnel/read/wildcard/e\\?e.txt", + sftpFileSystem.quote("/home/seatunnel/tmp/seatunnel/read/wildcard/e?e.txt")); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java index 56d0a4509df..7e906b3e4d6 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java @@ -19,18 +19,19 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.config; -import org.apache.seatunnel.shade.com.typesafe.config.Config; - import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergStreamScanStrategy; -import org.apache.iceberg.expressions.Expression; - import lombok.Getter; import lombok.ToString; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + import static org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergStreamScanStrategy.FROM_LATEST_SNAPSHOT; @Getter @@ -74,45 +75,53 @@ public class SourceConfig extends CommonConfig { .defaultValue(FROM_LATEST_SNAPSHOT) .withDescription(" the iceberg strategy of stream scanning"); - private Long startSnapshotTimestamp; - private Long startSnapshotId; - private Long endSnapshotId; + public static final Option> KEY_TABLE_LIST = + Options.key("table_list") + .listType(SourceTableConfig.class) + .noDefaultValue() + .withDescription(" the iceberg tables"); - private Long useSnapshotId; - private Long useSnapshotTimestamp; + public static final Option KEY_INCREMENT_SCAN_INTERVAL = + Options.key("increment.scan-interval") + .longType() + .defaultValue(2000L) + .withDescription(" the interval of increment scan(mills)"); - private IcebergStreamScanStrategy streamScanStrategy = KEY_STREAM_SCAN_STRATEGY.defaultValue(); - private Expression filter; - private Long splitSize; - private Integer splitLookback; - private Long splitOpenFileCost; + private long incrementScanInterval; + private List tableList; public SourceConfig(ReadonlyConfig readonlyConfig) { super(readonlyConfig); - Config pluginConfig = readonlyConfig.toConfig(); - if (pluginConfig.hasPath(KEY_START_SNAPSHOT_TIMESTAMP.key())) { - this.startSnapshotTimestamp = pluginConfig.getLong(KEY_START_SNAPSHOT_TIMESTAMP.key()); - } - if (pluginConfig.hasPath(KEY_START_SNAPSHOT_ID.key())) { - this.startSnapshotId = pluginConfig.getLong(KEY_START_SNAPSHOT_ID.key()); - } - if (pluginConfig.hasPath(KEY_END_SNAPSHOT_ID.key())) { - this.endSnapshotId = pluginConfig.getLong(KEY_END_SNAPSHOT_ID.key()); - } - if (pluginConfig.hasPath(KEY_USE_SNAPSHOT_ID.key())) { - this.useSnapshotId = pluginConfig.getLong(KEY_USE_SNAPSHOT_ID.key()); - } - if (pluginConfig.hasPath(KEY_USE_SNAPSHOT_TIMESTAMP.key())) { - this.useSnapshotTimestamp = pluginConfig.getLong(KEY_USE_SNAPSHOT_TIMESTAMP.key()); - } - if (pluginConfig.hasPath(KEY_STREAM_SCAN_STRATEGY.key())) { - this.streamScanStrategy = - pluginConfig.getEnum( - 
IcebergStreamScanStrategy.class, KEY_STREAM_SCAN_STRATEGY.key()); + this.incrementScanInterval = readonlyConfig.get(KEY_INCREMENT_SCAN_INTERVAL); + if (this.getTable() != null) { + SourceTableConfig tableConfig = + SourceTableConfig.builder() + .namespace(this.getNamespace()) + .table(this.getTable()) + .startSnapshotTimestamp( + readonlyConfig.get(KEY_START_SNAPSHOT_TIMESTAMP)) + .startSnapshotId(readonlyConfig.get(KEY_START_SNAPSHOT_ID)) + .endSnapshotId(readonlyConfig.get(KEY_END_SNAPSHOT_ID)) + .useSnapshotId(readonlyConfig.get(KEY_USE_SNAPSHOT_ID)) + .useSnapshotTimestamp(readonlyConfig.get(KEY_USE_SNAPSHOT_TIMESTAMP)) + .streamScanStrategy(readonlyConfig.get(KEY_STREAM_SCAN_STRATEGY)) + .build(); + this.tableList = Collections.singletonList(tableConfig); + } else { + this.tableList = + readonlyConfig.get(KEY_TABLE_LIST).stream() + .map( + tableConfig -> + tableConfig.setNamespace( + SourceConfig.this.getNamespace())) + .collect(Collectors.toList()); } } - public static SourceConfig loadConfig(ReadonlyConfig pluginConfig) { - return new SourceConfig(pluginConfig); + public SourceTableConfig getTableConfig(TablePath tablePath) { + return tableList.stream() + .filter(tableConfig -> tableConfig.getTablePath().equals(tablePath)) + .findFirst() + .get(); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceTableConfig.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceTableConfig.java new file mode 100644 index 00000000000..99524f8373a --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceTableConfig.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.config; + +import org.apache.seatunnel.api.table.catalog.TablePath; +import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergStreamScanStrategy; +import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.SchemaUtils; + +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.expressions.Expression; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.experimental.Tolerate; + +import java.io.Serializable; + +import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_STREAM_SCAN_STRATEGY; + +@AllArgsConstructor +@Data +@Builder +public class SourceTableConfig implements Serializable { + private String namespace; + private String table; + + private Long startSnapshotTimestamp; + private Long startSnapshotId; + private Long endSnapshotId; + + private Long useSnapshotId; + private Long useSnapshotTimestamp; + + private IcebergStreamScanStrategy streamScanStrategy = KEY_STREAM_SCAN_STRATEGY.defaultValue(); + private Expression filter; + private Long splitSize; + private Integer splitLookback; + private Long splitOpenFileCost; + + @Tolerate + public SourceTableConfig() {} + + public TablePath getTablePath() { + String[] paths = table.split("\\."); + if (paths.length == 1) { + return TablePath.of(namespace, table); + } + if (paths.length == 2) { + return TablePath.of(paths[0], paths[1]); + } + String namespace = table.substring(0, table.lastIndexOf("\\.")); + return TablePath.of(namespace, table); + } + + public TableIdentifier getTableIdentifier() { + return SchemaUtils.toIcebergTableIdentifier(getTablePath()); + } + + public SourceTableConfig setNamespace(String namespace) { + this.namespace = namespace; + return this; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkWriter.java index 1028ae21b4a..22f42a53bff 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkWriter.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkWriter.java @@ -158,6 +158,7 @@ public void close() throws IOException { if (writer != null) { writer.close(); } + icebergTableLoader.close(); } finally { results.clear(); } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitter.java index 008289690a9..7205e1108b3 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitter.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitter.java @@ -33,10 +33,11 @@ public class IcebergAggregatedCommitter implements SinkAggregatedCommitter { + private final IcebergTableLoader tableLoader; private final IcebergFilesCommitter filesCommitter; public IcebergAggregatedCommitter(SinkConfig config, CatalogTable catalogTable) { - 
IcebergTableLoader tableLoader = IcebergTableLoader.create(config, catalogTable); + this.tableLoader = IcebergTableLoader.create(config, catalogTable); this.filesCommitter = IcebergFilesCommitter.of(config, tableLoader); } @@ -68,5 +69,7 @@ public IcebergAggregatedCommitInfo combine(List commitInfos) public void abort(List aggregatedCommitInfo) throws Exception {} @Override - public void close() throws IOException {} + public void close() throws IOException { + this.tableLoader.close(); + } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java index c56f3f2f00e..4ed750e0c83 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java @@ -17,10 +17,7 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.source; -import org.apache.seatunnel.shade.com.typesafe.config.Config; - import org.apache.seatunnel.api.common.JobContext; -import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.source.Boundedness; import org.apache.seatunnel.api.source.SeaTunnelSource; import org.apache.seatunnel.api.source.SourceReader; @@ -28,34 +25,30 @@ import org.apache.seatunnel.api.source.SupportColumnProjection; import org.apache.seatunnel.api.source.SupportParallelism; import org.apache.seatunnel.api.table.catalog.CatalogTable; -import org.apache.seatunnel.api.table.catalog.CatalogTableUtil; -import org.apache.seatunnel.api.table.catalog.schema.TableSchemaOptions; -import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.type.SeaTunnelRow; -import org.apache.seatunnel.api.table.type.SeaTunnelRowType; -import org.apache.seatunnel.common.config.CheckConfigUtil; -import org.apache.seatunnel.common.config.CheckResult; import org.apache.seatunnel.common.constants.JobMode; -import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergCatalogLoader; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.IcebergBatchSplitEnumerator; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.IcebergSplitEnumeratorState; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.IcebergStreamSplitEnumerator; -import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergScanContext; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.reader.IcebergSourceReader; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.split.IcebergFileScanTaskSplit; -import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.SchemaUtils; +import org.apache.commons.lang3.tuple.Pair; import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; import lombok.SneakyThrows; import java.util.ArrayList; -import java.util.Collections; +import java.util.HashMap; import 
java.util.List; - -import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import java.util.Map; +import java.util.stream.Collectors; public class IcebergSource implements SeaTunnelSource< @@ -66,23 +59,21 @@ public class IcebergSource private static final long serialVersionUID = 4343414808223919870L; private final SourceConfig sourceConfig; - private final Schema tableSchema; - private final Schema projectedSchema; - private final SeaTunnelRowType seaTunnelRowType; + private final Map catalogTables; + private final Map> tableSchemaProjections; private JobContext jobContext; - private final CatalogTable catalogTable; - - public IcebergSource(ReadonlyConfig config, CatalogTable catalogTable) { - this.sourceConfig = SourceConfig.loadConfig(config); - this.tableSchema = loadIcebergSchema(sourceConfig); - this.seaTunnelRowType = loadSeaTunnelRowType(tableSchema, config.toConfig()); - this.projectedSchema = tableSchema.select(seaTunnelRowType.getFieldNames()); - this.catalogTable = catalogTable; + + public IcebergSource(SourceConfig config, List catalogTables) { + this.sourceConfig = config; + this.catalogTables = + catalogTables.stream() + .collect(Collectors.toMap(CatalogTable::getTablePath, table -> table)); + this.tableSchemaProjections = loadIcebergSchemaProjections(config, this.catalogTables); } @Override public List getProducedCatalogTables() { - return Collections.singletonList(catalogTable); + return new ArrayList<>(catalogTables.values()); } @Override @@ -91,46 +82,30 @@ public String getPluginName() { } @SneakyThrows - private Schema loadIcebergSchema(SourceConfig sourceConfig) { - try (IcebergTableLoader icebergTableLoader = - IcebergTableLoader.create(sourceConfig, catalogTable)) { - icebergTableLoader.open(); - return icebergTableLoader.loadTable().schema(); - } - } - - private SeaTunnelRowType loadSeaTunnelRowType(Schema tableSchema, Config pluginConfig) { - List columnNames = new ArrayList<>(tableSchema.columns().size()); - List> columnDataTypes = new ArrayList<>(tableSchema.columns().size()); - for (Types.NestedField column : tableSchema.columns()) { - columnNames.add(column.name()); - columnDataTypes.add(SchemaUtils.toSeaTunnelType(column.name(), column.type())); - } - SeaTunnelRowType originalRowType = - new SeaTunnelRowType( - columnNames.toArray(new String[0]), - columnDataTypes.toArray(new SeaTunnelDataType[0])); - - CheckResult checkResult = - CheckConfigUtil.checkAllExists(pluginConfig, TableSchemaOptions.SCHEMA.key()); - if (checkResult.isSuccess()) { - SeaTunnelRowType projectedRowType = - CatalogTableUtil.buildWithConfig(pluginConfig).getSeaTunnelRowType(); - for (int i = 0; i < projectedRowType.getFieldNames().length; i++) { - String fieldName = projectedRowType.getFieldName(i); - SeaTunnelDataType projectedFieldType = projectedRowType.getFieldType(i); - int originalFieldIndex = originalRowType.indexOf(fieldName); - SeaTunnelDataType originalFieldType = - originalRowType.getFieldType(originalFieldIndex); - checkArgument( - projectedFieldType.equals(originalFieldType), - String.format( - "Illegal field: %s, original: %s <-> projected: %s", - fieldName, originalFieldType, projectedFieldType)); + private Map> loadIcebergSchemaProjections( + SourceConfig config, Map tables) { + IcebergCatalogLoader catalogFactory = new IcebergCatalogLoader(config); + Catalog catalog = catalogFactory.loadCatalog(); + + Map> icebergTables = new HashMap<>(); + try { + for (TablePath tablePath : tables.keySet()) { + CatalogTable catalogTable = 
tables.get(tablePath); + Table icebergTable = + catalog.loadTable( + TableIdentifier.of( + tablePath.getDatabaseName(), tablePath.getTableName())); + Schema icebergSchema = icebergTable.schema(); + Schema projectedSchema = + icebergSchema.select(catalogTable.getTableSchema().getFieldNames()); + icebergTables.put(tablePath, Pair.of(icebergSchema, projectedSchema)); + } + } finally { + if (catalog instanceof AutoCloseable) { + ((AutoCloseable) catalog).close(); } - return projectedRowType; } - return originalRowType; + return icebergTables; } @Override @@ -149,12 +124,7 @@ public void setJobContext(JobContext jobContext) { public SourceReader createReader( SourceReader.Context readerContext) { return new IcebergSourceReader( - readerContext, - seaTunnelRowType, - tableSchema, - projectedSchema, - sourceConfig, - catalogTable); + readerContext, sourceConfig, catalogTables, tableSchemaProjections); } @Override @@ -163,18 +133,10 @@ public SourceReader createReader( SourceSplitEnumerator.Context enumeratorContext) { if (Boundedness.BOUNDED.equals(getBoundedness())) { return new IcebergBatchSplitEnumerator( - enumeratorContext, - IcebergScanContext.scanContext(sourceConfig, projectedSchema), - sourceConfig, - null, - catalogTable); + enumeratorContext, sourceConfig, catalogTables, tableSchemaProjections); } return new IcebergStreamSplitEnumerator( - enumeratorContext, - IcebergScanContext.streamScanContext(sourceConfig, projectedSchema), - sourceConfig, - null, - catalogTable); + enumeratorContext, sourceConfig, catalogTables, tableSchemaProjections); } @Override @@ -185,16 +147,16 @@ public SourceReader createReader( if (Boundedness.BOUNDED.equals(getBoundedness())) { return new IcebergBatchSplitEnumerator( enumeratorContext, - IcebergScanContext.scanContext(sourceConfig, projectedSchema), sourceConfig, - checkpointState, - catalogTable); + catalogTables, + tableSchemaProjections, + checkpointState); } return new IcebergStreamSplitEnumerator( enumeratorContext, - IcebergScanContext.streamScanContext(sourceConfig, projectedSchema), sourceConfig, - checkpointState, - catalogTable); + catalogTables, + tableSchemaProjections, + checkpointState); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java index 6e7c05c9ab1..e1c7424cc01 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java @@ -33,16 +33,19 @@ import org.apache.seatunnel.connectors.seatunnel.iceberg.catalog.IcebergCatalog; import org.apache.seatunnel.connectors.seatunnel.iceberg.catalog.IcebergCatalogFactory; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; -import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; import com.google.auto.service.AutoService; import lombok.extern.slf4j.Slf4j; import java.io.Serializable; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_CASE_SENSITIVE; import static 
org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_END_SNAPSHOT_ID; +import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_INCREMENT_SCAN_INTERVAL; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_START_SNAPSHOT_ID; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_START_SNAPSHOT_TIMESTAMP; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_STREAM_SCAN_STRATEGY; @@ -63,9 +66,9 @@ public OptionRule optionRule() { return OptionRule.builder() .required( CommonConfig.KEY_CATALOG_NAME, - SinkConfig.KEY_NAMESPACE, - SinkConfig.KEY_TABLE, - SinkConfig.CATALOG_PROPS) + CommonConfig.KEY_NAMESPACE, + CommonConfig.CATALOG_PROPS) + .exclusive(CommonConfig.KEY_TABLE, SourceConfig.KEY_TABLE_LIST) .optional( TableSchemaOptions.SCHEMA, KEY_CASE_SENSITIVE, @@ -74,7 +77,8 @@ public OptionRule optionRule() { KEY_END_SNAPSHOT_ID, KEY_USE_SNAPSHOT_ID, KEY_USE_SNAPSHOT_TIMESTAMP, - KEY_STREAM_SCAN_STRATEGY) + KEY_STREAM_SCAN_STRATEGY, + KEY_INCREMENT_SCAN_INTERVAL) .build(); } @@ -83,24 +87,37 @@ public OptionRule optionRule() { TableSource createSource(TableSourceFactoryContext context) { ReadonlyConfig options = context.getOptions(); SourceConfig config = new SourceConfig(options); - TablePath tablePath = TablePath.of(config.getNamespace(), config.getTable()); CatalogTable catalogTable; if (options.get(TableSchemaOptions.SCHEMA) != null) { + TablePath tablePath = config.getTableList().get(0).getTablePath(); catalogTable = CatalogTableUtil.buildWithConfig(factoryIdentifier(), options); TableIdentifier tableIdentifier = TableIdentifier.of(catalogTable.getCatalogName(), tablePath); CatalogTable table = CatalogTable.of(tableIdentifier, catalogTable); - return () -> (SeaTunnelSource) new IcebergSource(options, table); - } else { - // build iceberg catalog - IcebergCatalogFactory icebergCatalogFactory = new IcebergCatalogFactory(); - IcebergCatalog catalog = - (IcebergCatalog) - icebergCatalogFactory.createCatalog(factoryIdentifier(), options); + return () -> + (SeaTunnelSource) + new IcebergSource(config, Collections.singletonList(table)); + } + + try (IcebergCatalog catalog = + (IcebergCatalog) + new IcebergCatalogFactory().createCatalog(factoryIdentifier(), options)) { catalog.open(); - catalogTable = catalog.getTable(tablePath); + + if (config.getTable() != null) { + TablePath tablePath = config.getTableList().get(0).getTablePath(); + catalogTable = catalog.getTable(tablePath); + return () -> + (SeaTunnelSource) + new IcebergSource(config, Collections.singletonList(catalogTable)); + } + + List catalogTables = + config.getTableList().stream() + .map(tableConfig -> catalog.getTable(tableConfig.getTablePath())) + .collect(Collectors.toList()); return () -> - (SeaTunnelSource) new IcebergSource(options, catalogTable); + (SeaTunnelSource) new IcebergSource(config, catalogTables); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/AbstractSplitEnumerator.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/AbstractSplitEnumerator.java index 73cb71f45fe..8ccdfd6e0f2 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/AbstractSplitEnumerator.java +++ 
b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/AbstractSplitEnumerator.java @@ -19,14 +19,18 @@ import org.apache.seatunnel.api.source.SourceSplitEnumerator; import org.apache.seatunnel.api.table.catalog.CatalogTable; -import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.api.table.catalog.TablePath; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergCatalogLoader; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.split.IcebergFileScanTaskSplit; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.iceberg.Schema; import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.catalog.TableIdentifier; -import lombok.Getter; -import lombok.NonNull; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import java.io.IOException; @@ -37,6 +41,11 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; @Slf4j public abstract class AbstractSplitEnumerator @@ -44,75 +53,153 @@ public abstract class AbstractSplitEnumerator protected final Context context; protected final SourceConfig sourceConfig; + protected final Map tables; + protected final Map> tableSchemaProjections; + protected final Catalog icebergCatalog; + protected final Object stateLock = new Object(); + + protected final BlockingQueue pendingTables; protected final Map> pendingSplits; - protected IcebergTableLoader icebergTableLoader; - @Getter private volatile boolean isOpen = false; - private CatalogTable catalogTable; + public AbstractSplitEnumerator( + Context context, + SourceConfig sourceConfig, + Map catalogTables, + Map> tableSchemaProjections) { + this(context, sourceConfig, catalogTables, tableSchemaProjections, null); + } public AbstractSplitEnumerator( - @NonNull SourceSplitEnumerator.Context context, - @NonNull SourceConfig sourceConfig, - @NonNull Map> pendingSplits, - CatalogTable catalogTable) { + Context context, + SourceConfig sourceConfig, + Map catalogTables, + Map> tableSchemaProjections, + IcebergSplitEnumeratorState state) { this.context = context; this.sourceConfig = sourceConfig; - this.pendingSplits = new HashMap<>(pendingSplits); - this.catalogTable = catalogTable; + this.tables = catalogTables; + this.tableSchemaProjections = tableSchemaProjections; + this.icebergCatalog = new IcebergCatalogLoader(sourceConfig).loadCatalog(); + this.pendingTables = new ArrayBlockingQueue<>(catalogTables.size()); + this.pendingSplits = new HashMap<>(); + if (state == null) { + this.pendingTables.addAll( + catalogTables.values().stream() + .map(CatalogTable::getTablePath) + .collect(Collectors.toList())); + } else { + this.pendingTables.addAll(state.getPendingTables()); + state.getPendingSplits().values().stream() + .flatMap( + (Function< + List, + Stream>) + splits -> splits.stream()) + .map( + (Function) + split -> { + // TODO: Waiting for old version migration to complete + // before remove + if (split.getTablePath() == null) { + new IcebergFileScanTaskSplit( + catalogTables.values().stream() + .findFirst() + .get() + .getTablePath(), + split.getTask(), + split.getRecordOffset()); + } + return null; + }) + .forEach( + 
split -> + pendingSplits + .computeIfAbsent( + getSplitOwner( + split.splitId(), + context.currentParallelism()), + r -> new ArrayList<>()) + .add(split)); + } } @Override public void open() { - icebergTableLoader = IcebergTableLoader.create(sourceConfig, catalogTable); - icebergTableLoader.open(); - isOpen = true; - } - - @Override - public void run() { - refreshPendingSplits(); - assignPendingSplits(context.registeredReaders()); - } - - @Override - public void close() throws IOException { - icebergTableLoader.close(); - isOpen = false; + log.info("Open split enumerator."); } @Override public void addSplitsBack(List splits, int subtaskId) { - addPendingSplits(splits); - if (context.registeredReaders().contains(subtaskId)) { - assignPendingSplits(Collections.singleton(subtaskId)); + if (!splits.isEmpty()) { + synchronized (stateLock) { + addPendingSplits(splits); + if (context.registeredReaders().contains(subtaskId)) { + assignPendingSplits(Collections.singleton(subtaskId)); + } else { + log.warn( + "Reader {} is not registered. Pending splits {} are not assigned.", + subtaskId, + splits); + } + } } + log.info("Add back {} splits to IcebergSourceEnumerator.", splits.size()); } @Override public int currentUnassignedSplitSize() { - return pendingSplits.size(); + if (!pendingTables.isEmpty()) { + return pendingTables.size(); + } + if (!pendingSplits.isEmpty()) { + return pendingSplits.values().stream().mapToInt(List::size).sum(); + } + return 0; } + @Override + public void handleSplitRequest(int subtaskId) {} + @Override public void registerReader(int subtaskId) { log.debug("Adding reader {} to IcebergSourceEnumerator.", subtaskId); - assignPendingSplits(Collections.singleton(subtaskId)); + synchronized (stateLock) { + assignPendingSplits(Collections.singleton(subtaskId)); + } } @Override public void notifyCheckpointComplete(long checkpointId) throws Exception {} - protected void refreshPendingSplits() { - List newSplits = loadNewSplits(icebergTableLoader.loadTable()); - addPendingSplits(newSplits); + @SneakyThrows + @Override + public void close() throws IOException { + log.info("Close split enumerator."); + if (icebergCatalog instanceof AutoCloseable) { + ((AutoCloseable) icebergCatalog).close(); + } + } + + protected Table loadTable(TablePath tablePath) { + return icebergCatalog.loadTable( + TableIdentifier.of(tablePath.getDatabaseName(), tablePath.getTableName())); } - protected abstract List loadNewSplits(Table table); + protected void checkThrowInterruptedException() throws InterruptedException { + if (Thread.currentThread().isInterrupted()) { + log.info("Enumerator thread is interrupted."); + throw new InterruptedException("Enumerator thread is interrupted."); + } + } + + private static int getSplitOwner(String splitId, int numReaders) { + return (splitId.hashCode() & Integer.MAX_VALUE) % numReaders; + } - private void addPendingSplits(Collection newSplits) { + protected void addPendingSplits(Collection newSplits) { int numReaders = context.currentParallelism(); for (IcebergFileScanTaskSplit newSplit : newSplits) { - int ownerReader = (newSplit.splitId().hashCode() & Integer.MAX_VALUE) % numReaders; + int ownerReader = getSplitOwner(newSplit.splitId(), numReaders); pendingSplits.computeIfAbsent(ownerReader, r -> new ArrayList<>()).add(newSplit); log.info("Assigning {} to {} reader.", newSplit, ownerReader); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergBatchSplitEnumerator.java 
b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergBatchSplitEnumerator.java index b0adfb011c1..72f3f56635d 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergBatchSplitEnumerator.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergBatchSplitEnumerator.java @@ -17,61 +17,88 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator; -import org.apache.seatunnel.api.source.SourceSplitEnumerator; import org.apache.seatunnel.api.table.catalog.CatalogTable; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergScanContext; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergScanSplitPlanner; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.split.IcebergFileScanTaskSplit; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.iceberg.Schema; import org.apache.iceberg.Table; -import lombok.NonNull; import lombok.extern.slf4j.Slf4j; -import java.util.Collections; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Set; @Slf4j public class IcebergBatchSplitEnumerator extends AbstractSplitEnumerator { - private final IcebergScanContext icebergScanContext; + public IcebergBatchSplitEnumerator( + Context context, + SourceConfig sourceConfig, + Map catalogTables, + Map> tableSchemaProjections) { + this(context, sourceConfig, catalogTables, tableSchemaProjections, null); + } public IcebergBatchSplitEnumerator( - @NonNull SourceSplitEnumerator.Context context, - @NonNull IcebergScanContext icebergScanContext, - @NonNull SourceConfig sourceConfig, - IcebergSplitEnumeratorState restoreState, - CatalogTable catalogTable) { - super( - context, - sourceConfig, - restoreState != null ? restoreState.getPendingSplits() : Collections.emptyMap(), - catalogTable); - this.icebergScanContext = icebergScanContext; + Context context, + SourceConfig sourceConfig, + Map catalogTables, + Map> tableSchemaProjections, + IcebergSplitEnumeratorState state) { + super(context, sourceConfig, catalogTables, tableSchemaProjections, state); } @Override - public void run() { - super.run(); - + public void run() throws Exception { Set readers = context.registeredReaders(); + while (!pendingTables.isEmpty()) { + synchronized (stateLock) { + checkThrowInterruptedException(); + + TablePath tablePath = pendingTables.poll(); + log.info("Splitting table {}.", tablePath); + + Collection splits = loadSplits(tablePath); + log.info("Split table {} into {} splits.", tablePath, splits.size()); + + addPendingSplits(splits); + } + + synchronized (stateLock) { + assignPendingSplits(readers); + } + } + log.debug( "No more splits to assign." 
+ " Sending NoMoreSplitsEvent to reader {}.", readers); readers.forEach(context::signalNoMoreSplits); } @Override - public IcebergSplitEnumeratorState snapshotState(long checkpointId) { - return new IcebergSplitEnumeratorState(null, pendingSplits); + public IcebergSplitEnumeratorState snapshotState(long checkpointId) throws Exception { + synchronized (stateLock) { + return new IcebergSplitEnumeratorState( + new ArrayList<>(pendingTables), new HashMap<>(pendingSplits)); + } } - @Override - public void handleSplitRequest(int subtaskId) {} - - @Override - protected List loadNewSplits(Table table) { - return IcebergScanSplitPlanner.planSplits(table, icebergScanContext); + private List loadSplits(TablePath tablePath) { + Table table = loadTable(tablePath); + Pair tableSchemaProjection = tableSchemaProjections.get(tablePath); + IcebergScanContext scanContext = + IcebergScanContext.scanContext( + sourceConfig, + sourceConfig.getTableConfig(tablePath), + tableSchemaProjection.getRight()); + return IcebergScanSplitPlanner.planSplits(table, scanContext); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergSplitEnumeratorState.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergSplitEnumeratorState.java index 4b170989898..9637cd0ec12 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergSplitEnumeratorState.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergSplitEnumeratorState.java @@ -17,25 +17,66 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.split.IcebergFileScanTaskSplit; -import lombok.AllArgsConstructor; import lombok.Getter; -import lombok.Setter; import lombok.ToString; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; @Getter -@Setter -@AllArgsConstructor @ToString public class IcebergSplitEnumeratorState implements Serializable { private static final long serialVersionUID = -529307606400995298L; - private final IcebergEnumeratorPosition lastEnumeratedPosition; - private final Map> pendingSplits; + // TODO: Waiting for migration to complete before remove + @Deprecated private IcebergEnumeratorPosition lastEnumeratedPosition; + + private Collection pendingTables; + private Map> pendingSplits; + private Map tableOffsets; + + public IcebergSplitEnumeratorState( + Collection pendingTables, + Map> pendingSplits) { + this(pendingTables, pendingSplits, Collections.emptyMap()); + } + + public IcebergSplitEnumeratorState( + Collection pendingTables, + Map> pendingSplits, + Map tableOffsets) { + this.pendingTables = pendingTables; + this.pendingSplits = pendingSplits; + this.tableOffsets = tableOffsets; + } + + // TODO: Waiting for migration to complete before remove + @Deprecated + public IcebergSplitEnumeratorState( + IcebergEnumeratorPosition lastEnumeratedPosition, + Map> pendingSplits) { + this.lastEnumeratedPosition = lastEnumeratedPosition; + this.pendingSplits = pendingSplits; + this.pendingTables = new ArrayList<>(); + this.tableOffsets = new 
HashMap<>(); + } + + // TODO: Waiting for migration to complete before remove + @Deprecated + public IcebergSplitEnumeratorState setPendingTable(TablePath table) { + if (lastEnumeratedPosition != null) { + this.pendingTables.add(table); + this.tableOffsets.put(table, lastEnumeratedPosition); + } + return this; + } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergStreamSplitEnumerator.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergStreamSplitEnumerator.java index 266985a0775..14f02dd58d7 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergStreamSplitEnumerator.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/IcebergStreamSplitEnumerator.java @@ -17,79 +17,125 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator; -import org.apache.seatunnel.api.source.SourceSplitEnumerator; import org.apache.seatunnel.api.table.catalog.CatalogTable; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergScanContext; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergScanSplitPlanner; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.split.IcebergFileScanTaskSplit; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.iceberg.Schema; import org.apache.iceberg.Table; -import lombok.NonNull; import lombok.extern.slf4j.Slf4j; +import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; -import java.util.concurrent.atomic.AtomicReference; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; @Slf4j public class IcebergStreamSplitEnumerator extends AbstractSplitEnumerator { - private final IcebergScanContext icebergScanContext; - private final AtomicReference enumeratorPosition; + private final ConcurrentMap tableOffsets; + private volatile boolean initialized = false; public IcebergStreamSplitEnumerator( - @NonNull SourceSplitEnumerator.Context context, - @NonNull IcebergScanContext icebergScanContext, - @NonNull SourceConfig sourceConfig, - IcebergSplitEnumeratorState restoreState, - CatalogTable catalogTable) { - super( - context, - sourceConfig, - restoreState != null ? 
restoreState.getPendingSplits() : Collections.emptyMap(), - catalogTable); - this.icebergScanContext = icebergScanContext; - this.enumeratorPosition = new AtomicReference<>(); - if (restoreState != null) { - enumeratorPosition.set(restoreState.getLastEnumeratedPosition()); + Context context, + SourceConfig sourceConfig, + Map catalogTables, + Map> tableSchemaProjections) { + this(context, sourceConfig, catalogTables, tableSchemaProjections, null); + } + + public IcebergStreamSplitEnumerator( + Context context, + SourceConfig sourceConfig, + Map catalogTables, + Map> tableSchemaProjections, + IcebergSplitEnumeratorState state) { + super(context, sourceConfig, catalogTables, tableSchemaProjections, state); + this.tableOffsets = new ConcurrentHashMap<>(); + if (state != null) { + if (state.getLastEnumeratedPosition() != null) { + // TODO: Waiting for migration to complete before remove + state.setPendingTable( + catalogTables.values().stream().findFirst().get().getTablePath()); + } + this.tableOffsets.putAll(state.getTableOffsets()); + } + } + + @Override + public void run() throws Exception { + Set readers = context.registeredReaders(); + while (true) { + for (TablePath tablePath : pendingTables) { + checkThrowInterruptedException(); + + synchronized (stateLock) { + log.info("Scan table {}.", tablePath); + + Collection splits = loadSplits(tablePath); + log.info("Scan table {} into {} splits.", tablePath, splits.size()); + addPendingSplits(splits); + assignPendingSplits(readers); + } + } + + if (Boolean.FALSE.equals(initialized)) { + initialized = true; + } + + stateLock.wait(sourceConfig.getIncrementScanInterval()); } } @Override public IcebergSplitEnumeratorState snapshotState(long checkpointId) throws Exception { - return new IcebergSplitEnumeratorState(enumeratorPosition.get(), pendingSplits); + synchronized (stateLock) { + return new IcebergSplitEnumeratorState( + new ArrayList<>(pendingTables), + new HashMap<>(pendingSplits), + new HashMap<>(tableOffsets)); + } } @Override public void handleSplitRequest(int subtaskId) { - if (isOpen()) { - synchronized (this) { - if (pendingSplits.isEmpty() || pendingSplits.get(subtaskId) == null) { - refreshPendingSplits(); - } - assignPendingSplits(Collections.singleton(subtaskId)); - } + if (initialized) { + stateLock.notifyAll(); } } - @Override - protected List loadNewSplits(Table table) { + private List loadSplits(TablePath tablePath) { + Table table = loadTable(tablePath); + IcebergEnumeratorPosition offset = tableOffsets.get(tablePath); + Pair tableSchemaProjection = tableSchemaProjections.get(tablePath); + IcebergScanContext scanContext = + IcebergScanContext.streamScanContext( + sourceConfig, + sourceConfig.getTableConfig(tablePath), + tableSchemaProjection.getRight()); IcebergEnumerationResult result = - IcebergScanSplitPlanner.planStreamSplits( - table, icebergScanContext, enumeratorPosition.get()); - if (!Objects.equals(result.getFromPosition(), enumeratorPosition.get())) { + IcebergScanSplitPlanner.planStreamSplits(table, scanContext, offset); + if (!Objects.equals(result.getFromPosition(), offset)) { log.info( "Skip {} loaded splits because the scan starting position doesn't match " + "the current enumerator position: enumerator position = {}, scan starting position = {}", result.getSplits().size(), - enumeratorPosition.get(), + tableOffsets.get(tablePath), result.getFromPosition()); return Collections.emptyList(); } else { - enumeratorPosition.set(result.getToPosition()); + tableOffsets.put(tablePath, result.getToPosition()); 
log.debug("Update enumerator position to {}", result.getToPosition()); return result.getSplits(); } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanContext.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanContext.java index 7b29c80f678..09b2145c9fb 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanContext.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanContext.java @@ -17,7 +17,9 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceTableConfig; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.Expression; @@ -31,6 +33,7 @@ @ToString public class IcebergScanContext { + private final TablePath tablePath; private final boolean streaming; private final IcebergStreamScanStrategy streamScanStrategy; @@ -59,27 +62,30 @@ public IcebergScanContext copyWithAppendsBetween( .build(); } - public static IcebergScanContext scanContext(SourceConfig sourceConfig, Schema schema) { + public static IcebergScanContext scanContext( + SourceConfig sourceConfig, SourceTableConfig tableConfig, Schema schema) { return IcebergScanContext.builder() - .startSnapshotTimestamp(sourceConfig.getStartSnapshotTimestamp()) - .startSnapshotId(sourceConfig.getStartSnapshotId()) - .endSnapshotId(sourceConfig.getEndSnapshotId()) - .useSnapshotId(sourceConfig.getUseSnapshotId()) - .useSnapshotTimestamp(sourceConfig.getUseSnapshotTimestamp()) + .tablePath(tableConfig.getTablePath()) + .startSnapshotTimestamp(tableConfig.getStartSnapshotTimestamp()) + .startSnapshotId(tableConfig.getStartSnapshotId()) + .endSnapshotId(tableConfig.getEndSnapshotId()) + .useSnapshotId(tableConfig.getUseSnapshotId()) + .useSnapshotTimestamp(tableConfig.getUseSnapshotTimestamp()) .caseSensitive(sourceConfig.isCaseSensitive()) .schema(schema) - .filter(sourceConfig.getFilter()) - .splitSize(sourceConfig.getSplitSize()) - .splitLookback(sourceConfig.getSplitLookback()) - .splitOpenFileCost(sourceConfig.getSplitOpenFileCost()) + .filter(tableConfig.getFilter()) + .splitSize(tableConfig.getSplitSize()) + .splitLookback(tableConfig.getSplitLookback()) + .splitOpenFileCost(tableConfig.getSplitOpenFileCost()) .build(); } - public static IcebergScanContext streamScanContext(SourceConfig sourceConfig, Schema schema) { - return scanContext(sourceConfig, schema) + public static IcebergScanContext streamScanContext( + SourceConfig sourceConfig, SourceTableConfig tableConfig, Schema schema) { + return scanContext(sourceConfig, tableConfig, schema) .toBuilder() .streaming(true) - .streamScanStrategy(sourceConfig.getStreamScanStrategy()) + .streamScanStrategy(tableConfig.getStreamScanStrategy()) .build(); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java 
b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java index d006241da4c..404b4ab5ebe 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java @@ -174,7 +174,7 @@ public static List planSplits( List splits = new ArrayList<>(); for (CombinedScanTask combinedScanTask : tasksIterable) { for (FileScanTask fileScanTask : combinedScanTask.files()) { - splits.add(new IcebergFileScanTaskSplit(fileScanTask)); + splits.add(new IcebergFileScanTaskSplit(context.getTablePath(), fileScanTask)); } } return splits; diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskSplitReader.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskSplitReader.java index 7c472f9af7f..7b2236c5399 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskSplitReader.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskSplitReader.java @@ -44,10 +44,12 @@ public CloseableIterator open(@NonNull IcebergFileScanTaskSplit sp OffsetSeekIterator seekIterator = new OffsetSeekIterator<>(iterator); seekIterator.seek(split.getRecordOffset()); + String tableId = split.getTablePath().getFullName(); return CloseableIterator.transform( seekIterator, record -> { SeaTunnelRow seaTunnelRow = deserializer.deserialize(record); + seaTunnelRow.setTableId(tableId); split.setRecordOffset(split.getRecordOffset() + 1); return seaTunnelRow; }); diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergSourceReader.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergSourceReader.java index 83f42879d0b..b71b73f0898 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergSourceReader.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergSourceReader.java @@ -21,25 +21,33 @@ import org.apache.seatunnel.api.source.Collector; import org.apache.seatunnel.api.source.SourceReader; import org.apache.seatunnel.api.table.catalog.CatalogTable; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.type.SeaTunnelRow; -import org.apache.seatunnel.api.table.type.SeaTunnelRowType; -import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergCatalogLoader; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceTableConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.data.DefaultDeserializer; import org.apache.seatunnel.connectors.seatunnel.iceberg.data.Deserializer; 
import org.apache.seatunnel.connectors.seatunnel.iceberg.source.split.IcebergFileScanTaskSplit; +import org.apache.commons.lang3.tuple.Pair; import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.io.CloseableIterator; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; +import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; -import java.util.LinkedList; import java.util.List; -import java.util.Queue; +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.LinkedBlockingQueue; @Slf4j public class IcebergSourceReader implements SourceReader { @@ -47,72 +55,95 @@ public class IcebergSourceReader implements SourceReader pendingSplits; - private final Deserializer deserializer; - private final Schema tableSchema; - private final Schema projectedSchema; private final SourceConfig sourceConfig; + private final Map tables; + private final Map> tableSchemaProjections; + private final BlockingQueue pendingSplits; - private IcebergTableLoader icebergTableLoader; - private IcebergFileScanTaskSplitReader icebergFileScanTaskSplitReader; + private volatile IcebergFileScanTaskSplit currentReadSplit; + private volatile boolean noMoreSplitsAssignment; - private IcebergFileScanTaskSplit currentReadSplit; - private boolean noMoreSplitsAssignment; - - private CatalogTable catalogTable; + private Catalog catalog; + private ConcurrentMap tableReaders; public IcebergSourceReader( @NonNull SourceReader.Context context, - @NonNull SeaTunnelRowType seaTunnelRowType, - @NonNull Schema tableSchema, - @NonNull Schema projectedSchema, @NonNull SourceConfig sourceConfig, - CatalogTable catalogTable) { + @NonNull Map tables, + @NonNull Map> tableSchemaProjections) { this.context = context; - this.pendingSplits = new LinkedList<>(); - this.catalogTable = catalogTable; - this.deserializer = new DefaultDeserializer(seaTunnelRowType, projectedSchema); - this.tableSchema = tableSchema; - this.projectedSchema = projectedSchema; this.sourceConfig = sourceConfig; + this.tables = tables; + this.tableSchemaProjections = tableSchemaProjections; + this.pendingSplits = new LinkedBlockingQueue<>(); + this.tableReaders = new ConcurrentHashMap<>(); } @Override public void open() { - icebergTableLoader = IcebergTableLoader.create(sourceConfig, catalogTable); - icebergTableLoader.open(); - - icebergFileScanTaskSplitReader = - new IcebergFileScanTaskSplitReader( - deserializer, - IcebergFileScanTaskReader.builder() - .fileIO(icebergTableLoader.loadTable().io()) - .tableSchema(tableSchema) - .projectedSchema(projectedSchema) - .caseSensitive(sourceConfig.isCaseSensitive()) - .reuseContainers(true) - .build()); + IcebergCatalogLoader catalogFactory = new IcebergCatalogLoader(sourceConfig); + catalog = catalogFactory.loadCatalog(); } @Override public void close() throws IOException { - if (icebergFileScanTaskSplitReader != null) { - icebergFileScanTaskSplitReader.close(); + if (catalog != null && catalog instanceof Closeable) { + ((Closeable) catalog).close(); + } + tableReaders.forEach((tablePath, reader) -> reader.close()); + } + + private IcebergFileScanTaskSplitReader getOrCreateTableReader(TablePath tablePath) { + IcebergFileScanTaskSplitReader tableReader = tableReaders.get(tablePath); + if (tableReader != null) { + return tableReader; } - icebergTableLoader.close(); + + if 
(Boundedness.BOUNDED.equals(context.getBoundedness())) { + // clean up table readers if the source is bounded + tableReaders.forEach((key, value) -> value.close()); + tableReaders.clear(); + } + + return tableReaders.computeIfAbsent( + tablePath, + key -> { + SourceTableConfig tableConfig = sourceConfig.getTableConfig(key); + CatalogTable catalogTable = tables.get(key); + Pair pair = tableSchemaProjections.get(key); + Schema tableSchema = pair.getLeft(); + Schema projectedSchema = pair.getRight(); + Deserializer deserializer = + new DefaultDeserializer( + catalogTable.getSeaTunnelRowType(), projectedSchema); + + Table icebergTable = catalog.loadTable(tableConfig.getTableIdentifier()); + return new IcebergFileScanTaskSplitReader( + deserializer, + IcebergFileScanTaskReader.builder() + .fileIO(icebergTable.io()) + .tableSchema(tableSchema) + .projectedSchema(projectedSchema) + .caseSensitive(sourceConfig.isCaseSensitive()) + .reuseContainers(true) + .build()); + }); } @Override public void pollNext(Collector output) throws Exception { - for (IcebergFileScanTaskSplit pendingSplit = pendingSplits.poll(); - pendingSplit != null; - pendingSplit = pendingSplits.poll()) { - currentReadSplit = pendingSplit; - try (CloseableIterator rowIterator = - icebergFileScanTaskSplitReader.open(currentReadSplit)) { - while (rowIterator.hasNext()) { - output.collect(rowIterator.next()); + synchronized (output.getCheckpointLock()) { + currentReadSplit = pendingSplits.poll(); + if (currentReadSplit != null) { + IcebergFileScanTaskSplitReader tableReader = + getOrCreateTableReader(currentReadSplit.getTablePath()); + try (CloseableIterator rowIterator = + tableReader.open(currentReadSplit)) { + while (rowIterator.hasNext()) { + output.collect(rowIterator.next()); + } } + return; } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/split/IcebergFileScanTaskSplit.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/split/IcebergFileScanTaskSplit.java index 2b3870680d9..a476079404a 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/split/IcebergFileScanTaskSplit.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/split/IcebergFileScanTaskSplit.java @@ -18,6 +18,7 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.source.split; import org.apache.seatunnel.api.source.SourceSplit; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.iceberg.FileScanTask; @@ -36,11 +37,18 @@ public class IcebergFileScanTaskSplit implements SourceSplit { private static final long serialVersionUID = -9043797960947110643L; + private final TablePath tablePath; private final FileScanTask task; @Setter private volatile long recordOffset; + public IcebergFileScanTaskSplit(TablePath tablePath, @NonNull FileScanTask task) { + this(tablePath, task, 0); + } + + // TODO: Waiting for old version migration to complete before remove + @Deprecated public IcebergFileScanTaskSplit(@NonNull FileScanTask task) { - this(task, 0); + this(null, task, 0); } @Override diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java 
b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java index 1fbfd7c095e..1fb57f3e9f5 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java @@ -73,6 +73,11 @@ public String build(TablePath tablePath) { buildColumnSql(column), fieldIde)) .collect(Collectors.toList()); + // add primary key + if (createIndex && primaryKey != null) { + columnSqls.add("\t" + buildPrimaryKeySql()); + } + if (createIndex && CollectionUtils.isNotEmpty(constraintKeys)) { for (ConstraintKey constraintKey : constraintKeys) { if (StringUtils.isBlank(constraintKey.getConstraintName()) @@ -134,14 +139,6 @@ private String buildColumnSql(Column column) { if (!column.isNullable()) { columnSql.append(" NOT NULL"); } - - // Add primary key directly after the column if it is a primary key - if (createIndex - && primaryKey != null - && primaryKey.getColumnNames().contains(column.getName())) { - columnSql.append(" PRIMARY KEY"); - } - return columnSql.toString(); } @@ -163,6 +160,19 @@ private String buildColumnCommentSql(Column column, String tableName) { return columnCommentSql.toString(); } + private String buildPrimaryKeySql() { + String constraintName = UUID.randomUUID().toString().replace("-", ""); + String primaryKeyColumns = + primaryKey.getColumnNames().stream() + .map( + column -> + String.format( + "\"%s\"", + CatalogUtils.getFieldIde(column, fieldIde))) + .collect(Collectors.joining(",")); + return "CONSTRAINT \"" + constraintName + "\" PRIMARY KEY (" + primaryKeyColumns + ")"; + } + private String buildUniqueKeySql(ConstraintKey constraintKey) { String constraintName = UUID.randomUUID().toString().replace("-", ""); String indexColumns = diff --git a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilderTest.java b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilderTest.java index 03b99b1ca0a..cc820a4ed3d 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilderTest.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilderTest.java @@ -52,9 +52,10 @@ void build() { catalogTable.getTableId().toTablePath()); String pattern = "CREATE TABLE \"test\" \\(\n" - + "\"id\" int4 NOT NULL PRIMARY KEY,\n" + + "\"id\" int4 NOT NULL,\n" + "\"name\" text NOT NULL,\n" + "\"age\" int4 NOT NULL,\n" + + "\tCONSTRAINT \"([a-zA-Z0-9]+)\" PRIMARY KEY \\(\"id\",\"name\"\\),\n" + "\tCONSTRAINT \"([a-zA-Z0-9]+)\" UNIQUE \\(\"name\"\\)\n" + "\\);"; Assertions.assertTrue( @@ -142,7 +143,7 @@ private CatalogTable catalogTable(boolean otherDB) { TableSchema tableSchema = TableSchema.builder() .columns(columns) - .primaryKey(PrimaryKey.of("pk_id", Lists.newArrayList("id"))) + .primaryKey(PrimaryKey.of("pk_id_name", Lists.newArrayList("id", "name"))) .constraintKey( Lists.newArrayList( ConstraintKey.of( diff --git a/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigShadeUtils.java 
b/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigShadeUtils.java index 2772454efb2..f3a3013a066 100644 --- a/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigShadeUtils.java +++ b/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigShadeUtils.java @@ -46,6 +46,7 @@ public final class ConfigShadeUtils { private static final String SHADE_IDENTIFIER_OPTION = "shade.identifier"; + private static final String SHADE_PROPS_OPTION = "shade.properties"; public static final String[] DEFAULT_SENSITIVE_KEYWORDS = new String[] {"password", "username", "auth", "token", "access_key", "secret_key"}; @@ -101,7 +102,14 @@ public static Config decryptConfig(Config config) { : ConfigFactory.empty(), SHADE_IDENTIFIER_OPTION, DEFAULT_SHADE.getIdentifier()); - return decryptConfig(identifier, config); + Map props = + TypesafeConfigUtils.getConfig( + config.hasPath(Constants.ENV) + ? config.getConfig(Constants.ENV) + : ConfigFactory.empty(), + SHADE_PROPS_OPTION, + new HashMap<>()); + return decryptConfig(identifier, config, props); } public static Config encryptConfig(Config config) { @@ -112,20 +120,33 @@ public static Config encryptConfig(Config config) { : ConfigFactory.empty(), SHADE_IDENTIFIER_OPTION, DEFAULT_SHADE.getIdentifier()); - return encryptConfig(identifier, config); + Map props = + TypesafeConfigUtils.getConfig( + config.hasPath(Constants.ENV) + ? config.getConfig(Constants.ENV) + : ConfigFactory.empty(), + SHADE_PROPS_OPTION, + new HashMap<>()); + return encryptConfig(identifier, config, props); } - public static Config decryptConfig(String identifier, Config config) { - return processConfig(identifier, config, true); + private static Config decryptConfig( + String identifier, Config config, Map props) { + return processConfig(identifier, config, true, props); } - public static Config encryptConfig(String identifier, Config config) { - return processConfig(identifier, config, false); + private static Config encryptConfig( + String identifier, Config config, Map props) { + return processConfig(identifier, config, false, props); } @SuppressWarnings("unchecked") - private static Config processConfig(String identifier, Config config, boolean isDecrypted) { + private static Config processConfig( + String identifier, Config config, boolean isDecrypted, Map props) { ConfigShade configShade = CONFIG_SHADES.getOrDefault(identifier, DEFAULT_SHADE); + // call open method before the encrypt/decrypt + configShade.open(props); + List sensitiveOptions = new ArrayList<>(Arrays.asList(DEFAULT_SENSITIVE_KEYWORDS)); sensitiveOptions.addAll(Arrays.asList(configShade.sensitiveOptions())); BiFunction processFunction = diff --git a/seatunnel-core/seatunnel-core-starter/src/test/java/org/apache/seatunnel/core/starter/utils/ConfigShadeTest.java b/seatunnel-core/seatunnel-core-starter/src/test/java/org/apache/seatunnel/core/starter/utils/ConfigShadeTest.java index b62df816083..4cd8b9c8717 100644 --- a/seatunnel-core/seatunnel-core-starter/src/test/java/org/apache/seatunnel/core/starter/utils/ConfigShadeTest.java +++ b/seatunnel-core/seatunnel-core-starter/src/test/java/org/apache/seatunnel/core/starter/utils/ConfigShadeTest.java @@ -41,6 +41,7 @@ import java.util.ArrayList; import java.util.Base64; import java.util.List; +import java.util.Map; import static org.apache.seatunnel.core.starter.utils.ConfigBuilder.CONFIG_RENDER_OPTIONS; @@ -274,6 +275,55 @@ public void 
testDecryptAndEncrypt() { Assertions.assertEquals(decryptPassword, PASSWORD); } + @Test + public void testDecryptWithProps() throws URISyntaxException { + URL resource = ConfigShadeTest.class.getResource("/config.shade_with_props.json"); + Assertions.assertNotNull(resource); + Config decryptedProps = ConfigBuilder.of(Paths.get(resource.toURI()), Lists.newArrayList()); + + String suffix = "666"; + String rawUsername = "un"; + String rawPassword = "pd"; + Assertions.assertEquals( + rawUsername, decryptedProps.getConfigList("source").get(0).getString("username")); + Assertions.assertEquals( + rawPassword, decryptedProps.getConfigList("source").get(0).getString("password")); + + Config encryptedConfig = ConfigShadeUtils.encryptConfig(decryptedProps); + Assertions.assertEquals( + rawUsername + suffix, + encryptedConfig.getConfigList("source").get(0).getString("username")); + Assertions.assertEquals( + rawPassword + suffix, + encryptedConfig.getConfigList("source").get(0).getString("password")); + } + + public static class ConfigShadeWithProps implements ConfigShade { + + private String suffix; + private String identifier = "withProps"; + + @Override + public void open(Map props) { + this.suffix = String.valueOf(props.get("suffix")); + } + + @Override + public String getIdentifier() { + return identifier; + } + + @Override + public String encrypt(String content) { + return content + suffix; + } + + @Override + public String decrypt(String content) { + return content.substring(0, content.length() - suffix.length()); + } + } + public static class Base64ConfigShade implements ConfigShade { private static final Base64.Encoder ENCODER = Base64.getEncoder(); diff --git a/seatunnel-core/seatunnel-core-starter/src/test/resources/META-INF/services/org.apache.seatunnel.api.configuration.ConfigShade b/seatunnel-core/seatunnel-core-starter/src/test/resources/META-INF/services/org.apache.seatunnel.api.configuration.ConfigShade index 6d7378028f9..87b02ff318b 100644 --- a/seatunnel-core/seatunnel-core-starter/src/test/resources/META-INF/services/org.apache.seatunnel.api.configuration.ConfigShade +++ b/seatunnel-core/seatunnel-core-starter/src/test/resources/META-INF/services/org.apache.seatunnel.api.configuration.ConfigShade @@ -13,4 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
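The `ConfigShadeWithProps` test shade above only takes effect because it is registered in the `META-INF/services/org.apache.seatunnel.api.configuration.ConfigShade` file whose updated entries follow, i.e. through the standard JDK `ServiceLoader` mechanism. The sketch below is only an illustration of that SPI lookup, not SeaTunnel's actual loading code; the class name `ConfigShadeDiscoveryDemo` is made up, and it assumes the `ConfigShade` interface is on the classpath.

```java
import org.apache.seatunnel.api.configuration.ConfigShade;

import java.util.ServiceLoader;

public class ConfigShadeDiscoveryDemo {
    public static void main(String[] args) {
        // Each line in META-INF/services/org.apache.seatunnel.api.configuration.ConfigShade
        // names one implementation; ServiceLoader instantiates all of them.
        for (ConfigShade shade : ServiceLoader.load(ConfigShade.class)) {
            System.out.println(shade.getIdentifier());
        }
    }
}
```

The shade whose `getIdentifier()` equals the configured `shade.identifier` (here `withProps`) is then handed the `shade.properties` map through `open(...)` before any value is encrypted or decrypted, as the `processConfig` change above shows.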
-org.apache.seatunnel.core.starter.utils.ConfigShadeTest$Base64ConfigShade \ No newline at end of file +org.apache.seatunnel.core.starter.utils.ConfigShadeTest$Base64ConfigShade +org.apache.seatunnel.core.starter.utils.ConfigShadeTest$ConfigShadeWithProps \ No newline at end of file diff --git a/seatunnel-core/seatunnel-core-starter/src/test/resources/config.shade_with_props.json b/seatunnel-core/seatunnel-core-starter/src/test/resources/config.shade_with_props.json new file mode 100644 index 00000000000..c6f48bf6f7e --- /dev/null +++ b/seatunnel-core/seatunnel-core-starter/src/test/resources/config.shade_with_props.json @@ -0,0 +1,44 @@ +{ + "env" : { + "shade.identifier" : "withProps", + "parallelism" : 1, + "shade.properties" : { + "suffix" : "666" + } + }, + "source" : [ + { + "plugin_name" : "MySQL-CDC", + "base-url" : "jdbc:mysql://localhost:56725", + "username" : "un666", + "password" : "pd666", + "hostname" : "127.0.0.1", + "port" : 56725, + "database-name" : "inventory_vwyw0n", + "parallelism" : 1, + "table-name" : "products", + "server-id" : 5656, + "schema" : { + "fields" : { + "name" : "string", + "age" : "int", + "sex" : "boolean" + } + }, + "plugin_output" : "fake" + } + ], + "transform" : [], + "sink" : [ + { + "plugin_name" : "Clickhouse", + "host" : "localhost:8123", + "username" : "un666", + "password" : "pd666", + "database" : "default", + "table" : "fake_all", + "support_upsert" : true, + "primary_key" : "id" + } + ] +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/java/org/apache/seatunnel/e2e/connector/elasticsearch/ElasticsearchIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/java/org/apache/seatunnel/e2e/connector/elasticsearch/ElasticsearchIT.java index 87730fee46c..fa805d851b9 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/java/org/apache/seatunnel/e2e/connector/elasticsearch/ElasticsearchIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/java/org/apache/seatunnel/e2e/connector/elasticsearch/ElasticsearchIT.java @@ -133,6 +133,7 @@ public void startUp() throws Exception { createIndexDocs(); createIndexWithFullType(); createIndexForResourceNull("st_index4"); + createIndexWithNestType(); } /** create a index,and bulk some documents */ @@ -156,6 +157,31 @@ private void createIndexDocsByName(String indexName, List testDataSet) { esRestClient.bulk(requestBody.toString()); } + private void createIndexWithNestType() throws IOException, InterruptedException { + String mapping = + IOUtils.toString( + ContainerUtil.getResourcesFile("/elasticsearch/st_index_nest_mapping.json") + .toURI(), + StandardCharsets.UTF_8); + esRestClient.createIndex("st_index_nest", mapping); + esRestClient.createIndex("st_index_nest_copy", mapping); + BulkResponse response = + esRestClient.bulk( + "{ \"index\" : { \"_index\" : \"st_index_nest\", \"_id\" : \"1\" } }\n" + + IOUtils.toString( + ContainerUtil.getResourcesFile( + "/elasticsearch/st_index_nest_data.json") + .toURI(), + StandardCharsets.UTF_8) + .replace("\n", "") + + "\n"); + Assertions.assertFalse(response.isErrors(), response.getResponse()); + // waiting index refresh + Thread.sleep(INDEX_REFRESH_MILL_DELAY); + Assertions.assertEquals( + 3, esRestClient.getIndexDocsCount("st_index_nest").get(0).getDocsCount()); + } + private void createIndexWithFullType() throws IOException, InterruptedException { String mapping = IOUtils.toString( @@ -202,6 +228,21 @@ public void 
testElasticsearchWithSchema(TestContainer container) Assertions.assertIterableEquals(mapTestDatasetForDSL(), sinkData); } + @TestTemplate + public void testElasticsearchWithNestSchema(TestContainer container) + throws IOException, InterruptedException { + Container.ExecResult execResult = + container.executeJob("/elasticsearch/elasticsearch_source_and_sink_with_nest.conf"); + Assertions.assertEquals(0, execResult.getExitCode()); + + List sinkData = readSinkDataWithNestSchema("st_index_nest_copy"); + String data = + "{\"address\":[{\"zipcode\":\"10001\",\"city\":\"New York\",\"street\":\"123 Main St\"}," + + "{\"zipcode\":\"90001\",\"city\":\"Los Angeles\",\"street\":\"456 Elm St\"}],\"name\":\"John Doe\"}"; + + Assertions.assertIterableEquals(Lists.newArrayList(data), sinkData); + } + @TestTemplate public void testElasticsSearchWithMultiSourceByFilter(TestContainer container) throws InterruptedException, IOException { @@ -546,6 +587,13 @@ private List readSinkDataWithSchema(String index) throws InterruptedExce return getDocsWithTransformTimestamp(source, index); } + private List readSinkDataWithNestSchema(String index) throws InterruptedException { + // wait for index refresh + Thread.sleep(INDEX_REFRESH_MILL_DELAY); + List source = Lists.newArrayList("name", "address"); + return getDocsWithNestType(source, index); + } + private List readMultiSinkData(String index, List source) throws InterruptedException { // wait for index refresh @@ -604,6 +652,25 @@ private List getDocsWithTransformTimestamp(List source, String i return docs; } + private List getDocsWithNestType(List source, String index) { + Map query = new HashMap<>(); + query.put("match_all", new HashMap<>()); + ScrollResult scrollResult = esRestClient.searchByScroll(index, source, query, "1m", 1000); + scrollResult + .getDocs() + .forEach( + x -> { + x.remove("_index"); + x.remove("_type"); + x.remove("_id"); + }); + List docs = + scrollResult.getDocs().stream() + .map(JsonUtils::toJsonString) + .collect(Collectors.toList()); + return docs; + } + private List getDocsWithTransformDate(List source, String index) { return getDocsWithTransformDate(source, index, Collections.emptyList()); } @@ -739,6 +806,13 @@ private List mapTestDatasetForDSL(List testDataset) { .collect(Collectors.toList()); } + private List mapTestDatasetForNest(List testDataset) { + return testDataset.stream() + .map(JsonUtils::parseObject) + .map(JsonNode::toString) + .collect(Collectors.toList()); + } + /** * Use custom filtering criteria to query data * diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/elasticsearch_source_and_sink_with_nest.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/elasticsearch_source_and_sink_with_nest.conf new file mode 100644 index 00000000000..6b07c9b80f2 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/elasticsearch_source_and_sink_with_nest.conf @@ -0,0 +1,53 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +###### +###### This config file is a demonstration of streaming processing in seatunnel config +###### + +env { + parallelism = 1 + job.mode = "BATCH" + #checkpoint.interval = 10000 +} + +source { +Elasticsearch { + hosts = ["https://elasticsearch:9200"] + username = "elastic" + password = "elasticsearch" + index = "st_index_nest" + source = ["address","name"] + query = {"match_all": {}} + tls_verify_certificate = false + tls_verify_hostname = false + } +} + +transform { +} + +sink { + Elasticsearch { + hosts = ["https://elasticsearch:9200"] + username = "elastic" + password = "elasticsearch" + index = "st_index_nest_copy" + tls_verify_certificate = false + tls_verify_hostname = false + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/st_index_nest_data.json b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/st_index_nest_data.json new file mode 100644 index 00000000000..b63bdf962f0 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/st_index_nest_data.json @@ -0,0 +1,15 @@ +{ + "name": "John Doe", + "address": [ + { + "street": "123 Main St", + "city": "New York", + "zipcode": "10001" + }, + { + "street": "456 Elm St", + "city": "Los Angeles", + "zipcode": "90001" + } + ] +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/st_index_nest_mapping.json b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/st_index_nest_mapping.json new file mode 100644 index 00000000000..1b4d15a1023 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-elasticsearch-e2e/src/test/resources/elasticsearch/st_index_nest_mapping.json @@ -0,0 +1,23 @@ +{ + "mappings": { + "properties": { + "name": { + "type": "text" + }, + "address": { + "type": "nested", + "properties": { + "street": { + "type": "text" + }, + "city": { + "type": "keyword" + }, + "zipcode": { + "type": "keyword" + } + } + } + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileWithMultipleTableIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileWithMultipleTableIT.java index 4c63b7e3357..5303d4a6629 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileWithMultipleTableIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileWithMultipleTableIT.java @@ -65,6 +65,11 @@ public class LocalFileWithMultipleTableIT extends TestSuiteBase { "/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt", container); + ContainerUtil.copyFileIntoContainers( + "/binary/cat.png", + "/seatunnel/read/binary/name=tyrantlucifer/hobby=coding/cat.png", + 
container); + container.execInContainer("mkdir", "-p", "/tmp/fake_empty"); }; @@ -109,4 +114,11 @@ public void testLocalFileReadAndWriteInMultipleTableMode_text(TestContainer cont TestHelper helper = new TestHelper(container); helper.execute("/text/local_file_text_to_assert_with_multipletable.conf"); } + + @TestTemplate + public void testLocalFileReadAndWriteInMultipleTableMode_binary(TestContainer container) + throws IOException, InterruptedException { + TestHelper helper = new TestHelper(container); + helper.execute("/binary/local_file_binary_to_local_file_binary_with_multipletable.conf"); + } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/binary/local_file_binary_to_local_file_binary_with_multipletable.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/binary/local_file_binary_to_local_file_binary_with_multipletable.conf new file mode 100644 index 00000000000..c97dd83340f --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/binary/local_file_binary_to_local_file_binary_with_multipletable.conf @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + parallelism = 1 + job.mode = "BATCH" +} + +source { + LocalFile { + tables_configs = [ + { + schema { + table = "cat" + } + path = "/seatunnel/read/binary" + file_format_type = "binary" + }, + { + schema { + table = "dog" + } + path = "/seatunnel/read/binary" + file_format_type = "binary" + } + + ] + } +} +sink { + Assert { + rules { + table-names = ["cat", "dog"] + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java index 2ac185aabbd..235f39ae38d 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java @@ -113,6 +113,17 @@ public void startUp() throws Exception { "/home/seatunnel/tmp/seatunnel/read/xml/name=tyrantlucifer/hobby=coding/e2e.xml", sftpContainer); + // Windows does not support files with wildcard characters. 
We rename `e2e.txt` to + // `e*e.txt` when copying it into the container + ContainerUtil.copyFileIntoContainers( + "/text/e2e.txt", + "/home/seatunnel/tmp/seatunnel/read/wildcard/e*e.txt", + sftpContainer); + + ContainerUtil.copyFileIntoContainers( + "/text/e2e.txt", + "/home/seatunnel/tmp/seatunnel/read/wildcard/e2e.txt", + sftpContainer); sftpContainer.execInContainer("sh", "-c", "chown -R seatunnel /home/seatunnel/tmp/"); } @@ -138,6 +149,9 @@ public void testSftpFileReadAndWrite(TestContainer container) helper.execute("/text/sftp_file_text_projection_to_assert.conf"); // test read sftp zip text file helper.execute("/text/sftp_file_zip_text_to_assert.conf"); + // test reading a file with a wildcard character in its name, should match tmp/seatunnel/read/wildcard/e*e.txt + // and tmp/seatunnel/read/wildcard/e2e.txt + helper.execute("/text/sftp_file_text_wildcard_character_to_assert.conf"); // test write sftp json file helper.execute("/json/fake_to_sftp_file_json.conf"); // test read sftp json file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/text/sftp_file_text_wildcard_character_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/text/sftp_file_text_wildcard_character_to_assert.conf new file mode 100644 index 00000000000..cd8e27b743e --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/text/sftp_file_text_wildcard_character_to_assert.conf @@ -0,0 +1,117 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
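The SFTP IT above stages the same text file twice, once as `e2e.txt` and once under the literal name `e*e.txt`, and the job config that follows reads the whole `tmp/seatunnel/read/wildcard/` directory. As a standalone illustration of the glob semantics being exercised (the class name `WildcardFileNameDemo` is made up, and this is not the connector's own file-matching code), a name like `e*e.txt`, when interpreted as a pattern, matches both staged files, which is consistent with the `MIN_ROW` rule in the Assert sink of the new config:

```java
import java.nio.file.FileSystems;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;

public class WildcardFileNameDemo {
    public static void main(String[] args) {
        // Treat the literal file name "e*e.txt" as a glob pattern.
        PathMatcher glob = FileSystems.getDefault().getPathMatcher("glob:e*e.txt");
        System.out.println(glob.matches(Paths.get("e2e.txt"))); // true
        // On Windows '*' is not a legal path character (the limitation the test
        // comment mentions), so this line only works on POSIX file systems.
        System.out.println(glob.matches(Paths.get("e*e.txt"))); // true
    }
}
```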
+# + +env { + parallelism = 1 + job.mode = "BATCH" + + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + SftpFile { + host = "sftp" + port = 22 + user = seatunnel + password = pass + path = "tmp/seatunnel/read/wildcard/" + file_format_type = "text" + plugin_output = "sftp" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + Assert { + plugin_input = "sftp" + rules { + row_rules = [ + { + rule_type = MIN_ROW + rule_value = 10 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java index 87eec5834b9..b7850ddfc7e 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java @@ -147,6 +147,7 @@ private void initializeIcebergTable() { configs.put(CommonConfig.KEY_CATALOG_NAME.key(), CATALOG_NAME); configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); + configs.put(CommonConfig.KEY_TABLE.key(), TABLE.toString()); CATALOG = new IcebergCatalogLoader(new SourceConfig(ReadonlyConfig.fromMap(configs))) diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf index 351f5a58c03..fcec73e5d01 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf @@ -50,7 +50,11 @@ source { "warehouse"="file:///tmp/seatunnel/iceberg/hadoop/" } namespace = "database1" - table = "source" + table_list = [ + { + table = "source" + } + ] plugin_output = "iceberg" } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java 
b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java index 27eb102866c..4aab6f5aaf1 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java @@ -148,6 +148,7 @@ private void initializeIcebergTable() { configs.put(CommonConfig.KEY_CATALOG_NAME.key(), CATALOG_NAME); configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); + configs.put(CommonConfig.KEY_TABLE.key(), TABLE.toString()); ReadonlyConfig readonlyConfig = ReadonlyConfig.fromMap(configs); CATALOG = new IcebergCatalogLoader(new SourceConfig(readonlyConfig)).loadCatalog(); diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-s3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/s3/IcebergSourceIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-s3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/s3/IcebergSourceIT.java index 35101528929..2a23708aede 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-s3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/s3/IcebergSourceIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-s3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/s3/IcebergSourceIT.java @@ -230,6 +230,7 @@ private void initializeIcebergTable() { configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); configs.put(CommonConfig.HADOOP_PROPS.key(), getHadoopProps()); + configs.put(CommonConfig.KEY_TABLE.key(), TABLE.toString()); ReadonlyConfig readonlyConfig = ReadonlyConfig.fromMap(configs); CATALOG = new IcebergCatalogLoader(new SourceConfig(readonlyConfig)).loadCatalog(); diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-starrocks-e2e/src/test/java/org/apache/seatunnel/e2e/connector/starrocks/StarRocksIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-starrocks-e2e/src/test/java/org/apache/seatunnel/e2e/connector/starrocks/StarRocksIT.java index a575da7862a..1e984205ae9 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-starrocks-e2e/src/test/java/org/apache/seatunnel/e2e/connector/starrocks/StarRocksIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-starrocks-e2e/src/test/java/org/apache/seatunnel/e2e/connector/starrocks/StarRocksIT.java @@ -362,7 +362,7 @@ public void testCatalog() { "root", PASSWORD, String.format(URL, starRocksServer.getHost()), - "CREATE TABLE IF NOT EXISTS `${database}`.`${table}` (\n ${rowtype_fields}\n ) ENGINE=OLAP \n DUPLICATE KEY(`BIGINT_COL`) \n DISTRIBUTED BY HASH (BIGINT_COL) BUCKETS 1 \n PROPERTIES (\n \"replication_num\" = \"1\", \n \"in_memory\" = \"false\" , \n \"storage_format\" = \"DEFAULT\" \n )"); + "CREATE TABLE IF NOT EXISTS `${database}`.`${table}` (\n ${rowtype_fields}\n ) ENGINE=OLAP \n DUPLICATE KEY(`BIGINT_COL`) \n COMMENT '${comment}' \n DISTRIBUTED BY HASH (BIGINT_COL) BUCKETS 1 \n PROPERTIES (\n \"replication_num\" = \"1\", \n \"in_memory\" = \"false\" , \n \"storage_format\" = \"DEFAULT\" \n )"); starRocksCatalog.open(); CatalogTable catalogTable = starRocksCatalog.getTable(tablePathStarRocksSource); // sink tableExists ? 
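The StarRocks IT change above adds a `COMMENT '${comment}'` clause to the save-mode `CREATE TABLE` template, alongside the existing `${database}`, `${table}` and `${rowtype_fields}` placeholders. Below is a minimal sketch of plain `${key}` substitution with made-up sample values and a made-up class name (`CreateTableTemplateDemo`); the real catalog derives these values from the upstream `CatalogTable`, and its rendering logic is not shown here:

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class CreateTableTemplateDemo {
    // Naive ${key} substitution over a CREATE TABLE template.
    static String render(String template, Map<String, String> values) {
        String sql = template;
        for (Map.Entry<String, String> entry : values.entrySet()) {
            sql = sql.replace("${" + entry.getKey() + "}", entry.getValue());
        }
        return sql;
    }

    public static void main(String[] args) {
        Map<String, String> values = new LinkedHashMap<>();
        values.put("database", "test_db"); // sample value
        values.put("table", "orders"); // sample value
        values.put("rowtype_fields", "`BIGINT_COL` BIGINT NOT NULL"); // sample value
        values.put("comment", "created by SeaTunnel"); // fills the new ${comment} slot
        System.out.println(
                render(
                        "CREATE TABLE IF NOT EXISTS `${database}`.`${table}` (${rowtype_fields}) COMMENT '${comment}'",
                        values));
    }
}
```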
diff --git a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/ModelProvider.java b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/ModelProvider.java index 31721377062..f18ffdfc8e1 100644 --- a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/ModelProvider.java +++ b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/ModelProvider.java @@ -26,6 +26,7 @@ public enum ModelProvider { "https://ark.cn-beijing.volces.com/api/v3/embeddings"), QIANFAN("", "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings"), KIMIAI("https://api.moonshot.cn/v1/chat/completions", ""), + DEEPSEEK("https://api.deepseek.com/chat/completions", ""), MICROSOFT("", ""), CUSTOM("", ""), LOCAL("", ""); diff --git a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java index c99b03776e9..346fd688084 100644 --- a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java +++ b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/nlpmodel/llm/LLMTransform.java @@ -106,6 +106,7 @@ public void open() { config.get(LLMTransformConfig.API_KEY), provider.usedLLMPath(config.get(LLMTransformConfig.API_PATH))); break; + case DEEPSEEK: case OPENAI: case DOUBAO: model = diff --git a/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/InternalRowConverter.java b/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/InternalRowConverter.java index 1521409f594..f68c2892f9d 100644 --- a/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/InternalRowConverter.java +++ b/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/InternalRowConverter.java @@ -116,6 +116,18 @@ private static Object convert(Object field, SeaTunnelDataType dataType) { case ARRAY: Class elementTypeClass = ((ArrayType) dataType).getElementType().getTypeClass(); + + if (((ArrayType) dataType).getElementType() instanceof MapType) { + Object arrayMap = + Array.newInstance(ArrayBasedMapData.class, ((Map[]) field).length); + for (int i = 0; i < ((Map[]) field).length; i++) { + Map value = (Map) ((Map[]) field)[i]; + MapType type = + (MapType) ((ArrayType) dataType).getElementType(); + Array.set(arrayMap, i, convertMap(value, type)); + } + return ArrayData.toArrayData(arrayMap); + } // if string array, we need to covert every item in array from String to UTF8String if (((ArrayType) dataType).getElementType().equals(BasicType.STRING_TYPE)) { Object[] fields = (Object[]) field;
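The `InternalRowConverter` change at the end of this patch handles `ARRAY` columns whose element type is a `MAP`: the SeaTunnel-side value arrives as a `Map[]`, each element is converted into Spark's internal map representation (`ArrayBasedMapData` via `convertMap`), and the resulting array is wrapped with `ArrayData.toArrayData`. The sketch below mirrors only the reflective per-element pattern, using plain Java collections as stand-ins for the Spark catalyst types; the class name `ArrayOfMapPatternDemo` is made up.

```java
import java.lang.reflect.Array;
import java.util.LinkedHashMap;
import java.util.Map;

public class ArrayOfMapPatternDemo {
    // Stand-in for the per-element conversion; the real converter turns each
    // java.util.Map into Spark's ArrayBasedMapData.
    static Map<Object, Object> convertElement(Map<?, ?> element) {
        return new LinkedHashMap<Object, Object>(element);
    }

    public static void main(String[] args) {
        Map<String, String> address = new LinkedHashMap<>();
        address.put("city", "New York");
        Map<?, ?>[] field = new Map<?, ?>[] {address};

        // Same shape as the new ARRAY-of-MAP branch: allocate an array via
        // reflection, convert element by element, then hand the array onward
        // (ArrayData.toArrayData in the real code).
        Object converted = Array.newInstance(Map.class, field.length);
        for (int i = 0; i < field.length; i++) {
            Array.set(converted, i, convertElement(field[i]));
        }
        System.out.println(((Object[]) converted)[0]); // {city=New York}
    }
}
```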