功能描述
将表或查询结果导出到指定路径。您可以将一张表或查询结果导出为文件并保存到 Volume 的指定路径。请注意,如果指定路径下已存在同名文件,则该文件将会被覆盖。
注意事项
- 默认文件名:Lakehouse 每次导出时都使用相同的默认文件名。因此,如果您多次运行导出操作并且指定了相同的子目录(SUBDIRECTORY),新文件将覆盖上次导出的文件。
- 避免覆盖:为了防止意外覆盖现有文件,请确保每次导出时使用唯一的文件名或子目录。
语法
---语法
COPY INTO { VOLUME external_volume_name | TABLE VOLUME table_name | USER VOLUME }
SUBDIRECTORY '<path>'
FROM { [schema.]<table_name> | ( <query> ) }
FILE_FORMAT = ( TYPE = { CSV | PARQUET | JSON } [ formatTypeOptions ] )
[ copyOptions ]
参数说明
--直接指定表
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
--将SQL结果集导出
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM (select * from birds limit 1)
file_format = (type = CSV);
-
formatTypeOptions:文件格式,支持CSV、TEXT、PARQUET、JSON。其中JSON导出的格式是JSON LINE
-
CSV 格式支持的参数:
--指定分隔符为|和压缩
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= CSV
sep='|'
compression='gzip'
);
- headerQuotingStyle:默认值为 "needed",可选值 "none"。"needed" 始终给header两边加引号,"none"不加引号,遇到控制字符时报错。
- dataQuotingStyle:默认值为 "needed",可选值 "none-unsafe"。"needed" 始终给数据两边加引号,"none-unsafe" 始终不加引号,且遇到控制字符不报错(不符合 CSV 规范但我们支持)
-
JSON格式支持的参数:
-
Parquet , ORC , BSON 格式:无
-
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= parquet
);
-
copyOptions
- overwrite = true:清空目标文件夹后再导出(含子目录)。特别注意会清空子目录。
- filename_prefix = '<prefix_name>'。可选参数。设置文件前缀,例如:filename_prefix = 'my_prefix_'
--给文件添加前缀
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= json
)
filename_prefix='birds'
;
--查看目录,如第一行添加了前缀birds
SHOW VOLUME DIRECTORY my_volume SUBDIRECTORY 'dau_unload/';
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| relative_path | url | size | last_modified_time |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| dau_unload/birds00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/birds00001.json | 295 | 2024-12-27 17:29:20 |
| dau_unload/part00001.csv | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv | 105 | 2024-12-27 17:15:40 |
| dau_unload/part00001.csv.gzip | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv.gzip | 104 | 2024-12-27 17:19:33 |
| dau_unload/part00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.json | 295 | 2024-12-27 17:24:26 |
| dau_unload/part00001.parquet | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.parquet | 1886 | 2024-12-27 17:27:15 |
| dau_unload/part00001.text | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.text | 87 | 2024-12-27 17:25:34 |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
- filename_suffix = '<suffix>'。可选参数。设置文件后缀,例如:filename_suffix = '.data'
--添加后缀
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= json
)
filename_suffix='.data';
--查看目录,如第4行的文件添加了后缀.data
SHOW VOLUME DIRECTORY my_volume SUBDIRECTORY 'dau_unload/';
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| relative_path | url | size | last_modified_time |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
| dau_unload/birds00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/birds00001.json | 295 | 2024-12-27 17:29:20 |
| dau_unload/part00001.csv | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv | 105 | 2024-12-27 17:15:40 |
| dau_unload/part00001.csv.gzip | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv.gzip | 104 | 2024-12-27 17:19:33 |
| dau_unload/part00001.data | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.data | 295 | 2024-12-27 17:33:49 |
| dau_unload/part00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.json | 295 | 2024-12-27 17:24:26 |
| dau_unload/part00001.parquet | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.parquet | 1886 | 2024-12-27 17:27:15 |
| dau_unload/part00001.text | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.text | 87 | 2024-12-27 17:25:34 |
+-------------------------------+---------------------------------------------------------------------+------+---------------------+
- include_job_id = 'TRUE' | 'FALSE'。 可选参数。设置文件名是否写入作业ID,不设置时默认为不写入作业ID。例如:include_job_id = 'TRUE'
--导出的文件包含jobid
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= json
)
include_job_id = 'TRUE' ;
--查看目录,如第8行导出的文件名中包含了jobid
+--------------------------------------------------------+----------------------------------------------------------------------------------------------+------+---------------------+
| relative_path | url | size | last_modified_time |
+--------------------------------------------------------+----------------------------------------------------------------------------------------------+------+---------------------+
| dau_unload/birds00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/birds00001.json | 295 | 2024-12-27 17:29:20 |
| dau_unload/part00001.csv | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv | 105 | 2024-12-27 17:15:40 |
| dau_unload/part00001.csv.gzip | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.csv.gzip | 104 | 2024-12-27 17:19:33 |
| dau_unload/part00001.data | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.data | 295 | 2024-12-27 17:33:49 |
| dau_unload/part00001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.json | 295 | 2024-12-27 17:24:26 |
| dau_unload/part00001.parquet | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.parquet | 1886 | 2024-12-27 17:27:15 |
| dau_unload/part00001.text | oss://lakehouse-perf-test/test_insert/dau_unload/part00001.text | 87 | 2024-12-27 17:25:34 |
| dau_unload/part202412271736045501gmspelya5o900001.json | oss://lakehouse-perf-test/test_insert/dau_unload/part202412271736045501gmspelya5o900001.json | 295 | 2024-12-27 17:36:04 |
+--------------------------------------------------------+----------------------------------------------------------------------------------------------+------+---------------------+
- single = true | false,是否导出为单个文件。true 表示导出为单个文件,false 表示导出为多个文件,默认值是 false。导出为单文件时会涉及数据合并操作,对于大数据集可能需要较多内存和时间,但不会改变最终的 I/O 吞吐量
-- 默认导出为 data.csv
COPY INTO USER VOLUME SUBDIRECTORY 'export_single/'
FROM TABLE birds
file_format = (type = CSV)
single = true;
-- 验证结果
SHOW USER VOLUME DIRECTORY SUBDIRECTORY 'export_single/';
使用示例
导出数据到user volume中
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
COPY INTO USER VOLUME SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
--查看是否导出成功
SHOW USER VOLUME DIRECTORY SUBDIRECTORY 'dau_unload/';
--删除文件避免占用存储
REMOVE VOLUME my_volume FILE 'dau_unload/part00001.csv';
导出数据到table volume中
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
COPY INTO TABLE VOLUME birds SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
--查看是否导出成功
SHOW TABLE VOLUME DIRECTORY birds SUBDIRECTORY 'dau_unload/';
--删除文件避免占用存储
REMOVE TABLE VOLUME birds FILE 'dau_unload/part00001.csv';
导出到外部 Volume 的使用前提:需要先创建 CONNECTION 和 VOLUME。创建过程可以参考 CONNECTION 创建和 VOLUME 创建
导出数据到oss中
--创建表
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
--创建oss connection
CREATE STORAGE CONNECTION catalog_storage_oss
type OSS
ACCESS_ID='xxxx'
ACCESS_KEY='xxxxxxx'
ENDPOINT='oss-cn-hangzhou-internal.aliyuncs.com';
--创建volume
CREATE EXTERNAL VOLUME my_volume
location 'oss://mybucket/test_insert/'
using connection catalog_storage_oss
directory = (
enable=true,
auto_refresh=true
);
--将数据导出到dau_unload子目录下
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
导出数据到cos中
--创建表
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
--创建cos connection
CREATE STORAGE CONNECTION my_conn
TYPE COS
ACCESS_KEY = '<access_key>'
SECRET_KEY = '<secret_key>'
REGION = 'ap-shanghai'
APP_ID = '1310000503';
--创建volume
CREATE EXTERNAL VOLUME my_volume
location 'cos://mybucket/test_insert/'
using connection my_conn
directory = (
enable=true,
auto_refresh=true
);
--将数据导出到dau_unload子目录下
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
导出数据到s3中
--创建表
CREATE TABLE birds (
id INT,
name VARCHAR(50),
wingspan_cm FLOAT,
colors STRING
);
INSERT INTO birds (id, name, wingspan_cm, colors) VALUES
(1, 'Sparrow', 15.5, 'Brown'),
(2, 'Blue Jay', 20.2, 'Blue'),
(3, 'Cardinal', 22.1, 'Red'),
(4, 'Robin', 18.7, 'Red","Brown');
--创建s3 connection
CREATE STORAGE CONNECTION aws_bj_conn
TYPE S3
ACCESS_KEY = 'AKIAQNBSBP6EIJE33***'
SECRET_KEY = '7kfheDrmq***************************'
ENDPOINT = 's3.cn-north-1.amazonaws.com.cn'
REGION = 'cn-north-1';
--创建volume
CREATE EXTERNAL VOLUME my_volume
location 's3://mybucket/test_insert/'
using connection aws_bj_conn
directory = (
enable=true,
auto_refresh=true
);
--将数据导出到dau_unload子目录下
COPY INTO VOLUME my_volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (type = CSV);
导出EXCEL兼容格式避免乱码,并下载到本地
COPY INTO user volume SUBDIRECTORY 'dau_unload/'
FROM TABLE birds
file_format = (
type= CSV
writebom=true
);