-- 注释语法
CREATE TABLE orders (
id INT COMMENT '主键',
name STRING COMMENT '姓名'
) COMMENT '订单表';
-- NOT NULL 约束
CREATE TABLE users (id INT NOT NULL, name STRING);
-- 默认值
CREATE TABLE tasks (id INT, status STRING DEFAULT 'pending');
-- 生成列
CREATE TABLE order_items (
id INT, price DOUBLE, quantity INT,
total DOUBLE GENERATED ALWAYS AS (price * quantity)
);
-- 表属性
CREATE TABLE events (id INT) TBLPROPERTIES ('key1' = 'value1');
-- 条件建表
CREATE TABLE IF NOT EXISTS backup_table AS SELECT * FROM original_table;
CTAS(Create Table As Select)语法在 Lakehouse 中基本兼容,但有以下差异:
USING
USING
关键字可选
-- Spark
CREATE TABLE orders_ctas
USING parquet
AS SELECT * FROM orders;
-- Lakehouse(USING 可保留,也可省略)
CREATE TABLE orders_ctas
AS SELECT * FROM orders;
CTAS 不支持分区子句
Lakehouse 的 CTAS 语法不支持
PARTITIONED BY
PARTITIONED BY
子句。如需创建分区表,应先建表再插入数据:
-- 错误写法:CTAS 不支持 PARTITIONED BY
CREATE TABLE orders_partitioned
PARTITIONED BY (order_date)
AS SELECT * FROM orders;
-- 正确写法:先建表,再插入
CREATE TABLE orders_partitioned (
order_id INT,
customer_id INT,
amount DOUBLE,
order_date STRING
) PARTITIONED BY (order_date);
INSERT INTO orders_partitioned
SELECT * FROM orders;
分桶表
分桶表语法与 Spark 完全一致,无需修改:
-- Spark 和 Lakehouse 写法完全一致
CREATE TABLE users (
id INT,
name STRING
) CLUSTERED BY (id) INTO 16 BUCKETS;
-- 完全兼容
UPDATE orders SET status = 'shipped' WHERE id = 1;
DELETE FROM orders WHERE status = 'cancelled';
MERGE INTO
-- 完全兼容
MERGE INTO orders AS target
USING staging_orders AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET target.amount = source.amount
WHEN NOT MATCHED THEN INSERT (id, amount, order_date) VALUES (source.id, source.amount, source.order_date);
SELECT 查询
完全兼容的语法
以下 Spark SQL 常用语法在 Lakehouse 中完全兼容,无需修改:
语法
示例
状态
反引号引用列名
SELECT `id`, `name` FROM t
SELECT `id`, `name` FROM t
✅
字符串连接
`SELECT 'Hello'
CASE WHEN
CASE WHEN x > 0 THEN 'positive' END
CASE WHEN x > 0 THEN 'positive' END
✅
GROUP BY 位置引用
GROUP BY 1, 2
GROUP BY 1, 2
✅
ORDER BY 位置引用
ORDER BY 1 DESC
ORDER BY 1 DESC
✅
HAVING 使用别名
HAVING total > 100
HAVING total > 100
✅
LIMIT
LIMIT 10
LIMIT 10
✅
VALUES 子句
SELECT * FROM VALUES (1, 'a'), (2, 'b') AS t(id, name)
SELECT * FROM VALUES (1, 'a'), (2, 'b') AS t(id, name)
✅
JOIN ... USING
JOIN b USING (id)
JOIN b USING (id)
✅
CROSS JOIN
CROSS JOIN
CROSS JOIN
✅
RLIKE / REGEXP
'abc' RLIKE '[a-z]+$'
'abc' RLIKE '[a-z]+$'
✅
NULLIF / NVL
NULLIF(a, b)
NULLIF(a, b)
,
NVL(col, 'default')
NVL(col, 'default')
✅
EXCEPT / INTERSECT
SELECT ... EXCEPT SELECT ...
SELECT ... EXCEPT SELECT ...
✅
QUALIFY
QUALIFY ROW_NUMBER() = 1
QUALIFY ROW_NUMBER() = 1
✅
隐式类型转换
WHERE str_col = 123
WHERE str_col = 123
✅
除以零
SELECT 1/0
SELECT 1/0
→
NULL
NULL
✅
相关子查询
Lakehouse 完全支持相关子查询(Correlated Subquery),包括
EXISTS
EXISTS
/
NOT EXISTS
NOT EXISTS
中引用外部列,语法与 Spark 一致,无需修改:
-- 完全兼容,无需改写
SELECT id FROM orders a
WHERE EXISTS (SELECT 1 FROM customers b WHERE b.id = a.customer_id);
SELECT id FROM orders a
WHERE NOT EXISTS (SELECT 1 FROM blacklist b WHERE b.order_id = a.id);
窗口函数
Lakehouse 完全兼容 Spark SQL 的窗口函数,语法与 Spark 完全一致:
SELECT
id,
amount,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at) AS rn,
SUM(amount) OVER (PARTITION BY user_id ORDER BY created_at ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM orders;
支持的窗口函数:
ROW_NUMBER()
ROW_NUMBER()
、
RANK()
RANK()
、
DENSE_RANK()
DENSE_RANK()
、
LAG()
LAG()
、
LEAD()
LEAD()
、
FIRST_VALUE()
FIRST_VALUE()
、
LAST_VALUE()
LAST_VALUE()
、
NTILE()
NTILE()
、
SUM() OVER()
SUM() OVER()
、
AVG() OVER()
AVG() OVER()
等。
表生成函数
Lakehouse 支持 Spark 常用的表生成函数:
函数
支持
说明
explode()
explode()
✅
完全兼容,支持
LATERAL VIEW
LATERAL VIEW
和表函数语法
posexplode()
posexplode()
⚠️
AS (pos, val)
AS (pos, val)
别名语法支持;
LATERAL VIEW ... AS pos, val
LATERAL VIEW ... AS pos, val
语法不支持,见下方说明
inline()
inline()
✅
直接调用
stack()
stack()
✅
直接调用
json_tuple()
json_tuple()
✅
完全兼容,支持
LATERAL VIEW
LATERAL VIEW
语法
-- explode 两种写法均支持
SELECT id, item FROM orders LATERAL VIEW explode(items) t AS item;
SELECT id, item FROM orders, explode(items) AS t(item);
-- posexplode:AS (pos, val) 别名语法支持
SELECT pos, val FROM posexplode(ARRAY('a', 'b', 'c')) AS t(pos, val);
-- 返回:(0,'a'), (1,'b'), (2,'c')
-- posexplode:LATERAL VIEW AS pos, val 语法不支持,需改写
-- Spark 写法(不支持):
-- SELECT pos, val FROM t LATERAL VIEW posexplode(items) tmp AS pos, val
-- Lakehouse 改写:
SELECT pos, val FROM t, posexplode(items) AS tmp(pos, val);
-- inline/stack/json_tuple 直接调用
SELECT inline(ARRAY(named_struct('a', 1), named_struct('a', 2)));
SELECT stack(2, 'a', 1, 'b', 2);
SELECT a, b FROM json_tuple('{"a":1,"b":2}', 'a', 'b') AS j(a, b);
-- Spark
CREATE OR REPLACE TEMP VIEW temp_orders AS SELECT * FROM orders WHERE status = 'active';
-- Lakehouse:改为持久化视图,需要有 Schema 写权限
CREATE OR REPLACE VIEW my_schema.temp_orders AS SELECT * FROM orders WHERE status = 'active';
⚠️ 持久化 VIEW 会写入 Schema,不会随会话结束自动清理,需要手动
DROP VIEW
DROP VIEW
或纳入表生命周期管理。
方案二:改为 CTE(适合单次查询内的临时逻辑)
-- Spark
CREATE OR REPLACE TEMP VIEW daily_orders AS
SELECT order_date, SUM(amount) AS total FROM orders GROUP BY order_date;
SELECT * FROM daily_orders WHERE total > 1000;
-- Lakehouse:改为 CTE,逻辑内联在查询中
WITH daily_orders AS (
SELECT order_date, SUM(amount) AS total FROM orders GROUP BY order_date
)
SELECT * FROM daily_orders WHERE total > 1000;
-- Spark SQL UDF
CREATE FUNCTION multiply(x INT, y INT) RETURNS INT RETURN x * y;
-- Lakehouse SQL Function(语法相同)
CREATE FUNCTION my_schema.multiply(x INT, y INT) RETURNS INT RETURN x * y;
与 Spark 的关键差异:必须加 Schema 前缀
Lakehouse SQL Function 是 Schema 级对象,调用时默认需要加 Schema 前缀:
SQL Function 也支持表函数(返回多行),对应 Spark 的 UDTF 中纯 SQL 可表达的场景:
-- 返回多行的 SQL Function(表函数)
CREATE FUNCTION my_schema.get_employees(dept INT)
RETURNS TABLE(name STRING)
RETURN SELECT name FROM employee WHERE deptno = dept;
SELECT * FROM my_schema.get_employees(10);
-- 第 4 步:创建 External Function(Java UDF 示例)
CREATE EXTERNAL FUNCTION my_schema.my_upper
AS 'com.example.GenericUdfUpper'
USING ARCHIVE 'volume://fc_volume/udfs/my_upper.zip'
CONNECTION my_fc_conn
WITH PROPERTIES ('remote.udf.api' = 'java8.hive2.v0');
-- 调用方式与普通函数相同(需加 Schema 前缀)
SELECT my_schema.my_upper(name) FROM users;
-- Python UDF 示例
CREATE EXTERNAL FUNCTION my_schema.clean_phone
AS 'handler.clean_phone'
USING FILE 'volume:user://~/clean_phone.zip'
CONNECTION my_fc_conn
WITH PROPERTIES ('remote.udf.api' = 'python3.mc.v0');
SELECT my_schema.clean_phone(phone_number) FROM users;
⚠️ External Function 的创建语法与 SQL Function 不同,不支持在 DDL 中声明参数类型和返回类型,类型由函数代码内部处理。详见 CREATE EXTERNAL FUNCTION。