数据源配置:JDBC 任务需要预先在 Studio 管理 → 数据源 里配置好外部数据库连接,执行时通过
--datasource
--datasource
指定数据源名称,
--database
--database
指定库名。
⚠️ 支持的数据源:本文以 MySQL 为例,经过实际验证。Hive、ClickHouse 等其他数据源的支持情况以 Studio 数据源配置页面为准。
场景一:数据同步前探索
在配置批量同步任务之前,先用 JDBC 任务了解源库的基本情况。
SQL 脚本
-- 查看目标表的字段结构
SELECT
column_name,
column_type,
is_nullable,
column_key,
column_default
FROM information_schema.columns
WHERE table_schema = DATABASE()
AND table_name = 'orders'
ORDER BY ordinal_position;
-- 查看数据量和时间范围
SELECT
COUNT(*) AS total_rows,
MIN(order_date) AS earliest_date,
MAX(order_date) AS latest_date,
COUNT(DISTINCT user_id) AS unique_users
FROM orders;
-- 查看各状态分布
SELECT status, COUNT(*) AS cnt
FROM orders
GROUP BY status
ORDER BY cnt DESC;
-- 抽样查看数据
SELECT * FROM orders ORDER BY order_id DESC LIMIT 10;
-- ── 1. 整体数据质量概览 ──────────────────────────────────────────────────
SELECT
COUNT(*) AS total_rows,
COUNT(DISTINCT order_id) AS unique_orders,
COUNT(*) - COUNT(DISTINCT order_id) AS duplicate_pks,
SUM(CASE WHEN user_id IS NULL OR user_id = 0 THEN 1 END) AS null_user_id,
SUM(CASE WHEN product IS NULL OR product = '' THEN 1 END) AS null_product,
SUM(CASE WHEN amount <= 0 THEN 1 END) AS invalid_amount,
SUM(CASE WHEN status NOT IN ('paid','pending','cancelled')
THEN 1 END) AS invalid_status
FROM doc_orders_raw;
-- ── 2. 列出所有异常记录 ──────────────────────────────────────────────────
SELECT order_id, user_id, product, amount, status, order_date,
CASE
WHEN user_id IS NULL OR user_id = 0 THEN 'null_user_id'
WHEN product IS NULL OR product = '' THEN 'null_product'
WHEN amount <= 0 THEN 'invalid_amount'
WHEN status NOT IN ('paid','pending','cancelled') THEN 'invalid_status'
END AS issue_type
FROM doc_orders_raw
WHERE user_id IS NULL OR user_id = 0
OR product IS NULL OR product = ''
OR amount <= 0
OR status NOT IN ('paid','pending','cancelled')
ORDER BY order_id;
-- ── 3. 重复数据检查(同一用户同一商品同一天) ────────────────────────────
SELECT user_id, product, order_date, COUNT(*) AS cnt
FROM doc_orders_raw
GROUP BY user_id, product, order_date
HAVING COUNT(*) > 1
ORDER BY cnt DESC;
-- ── 4. 数据分布检查(用于判断同步范围) ─────────────────────────────────
SELECT
order_date,
COUNT(*) AS total,
SUM(CASE WHEN status = 'paid' THEN 1 END) AS paid,
SUM(CASE WHEN status = 'pending' THEN 1 END) AS pending,
ROUND(SUM(amount), 2) AS total_amount
FROM doc_orders_raw
GROUP BY order_date
ORDER BY order_date;