
窗口函数定义
SELECT SUM(salary) AS "月薪总和"
FROM employee;
月薪总和
---------
245800.00
SELECT emp_name AS "员工姓名",
SUM(salary) OVER () AS "月薪总和"
FROM employee;
员工姓名|月薪总和
-------|---------
刘备 |245800.00
关羽 |245800.00
张飞 |245800.00
...
window_function ([expression], ...) OVER (
PARTITION BY ...
ORDER BY ...
frame_clause
)
创建数据分区
SELECT emp_name "员工姓名", salary "月薪", dept_id "部门编号",
SUM(salary) OVER (
PARTITION BY dept_id
) AS "部门合计"
FROM employee;
员工姓名|月薪 |部门编号|部门合计
-------|--------|-------|--------
刘备 |30000.00| 1|80000.00
关羽 |26000.00| 1|80000.00
张飞 |24000.00| 1|80000.00
诸葛亮 |24000.00| 2|39500.00
黄忠 | 8000.00| 2|39500.00
魏延 | 7500.00| 2|39500.00
...
分区内的排序
SELECT emp_name "姓名", salary "月薪", dept_id "部门编号",
RANK() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS "部门排名"
FROM employee;
姓名 |月薪 |部门编号|部门排名
------|--------|-------|-------
刘备 |30000.00| 1| 1
关羽 |26000.00| 1| 2
张飞 |24000.00| 1| 3
诸葛亮|24000.00| 2| 1
黄忠 | 8000.00| 2| 2
魏延 | 7500.00| 2| 3
...
指定窗口大小
{ ROWS | RANGE } frame_start
{ ROWS | RANGE } BETWEEN frame_start AND frame_end
UNBOUNDED PRECEDING,表示窗口从分区的第一行开始。
N PRECEDING,表示窗口从当前行之前的第 N 行开始。
CURRENT ROW,表示窗口从当前行开始。
CURRENT ROW,表示窗口到当前行结束。
N FOLLOWING,表示窗口到当前行之后的第 N 行结束。
UNBOUNDED FOLLOWING,表示窗口到分区的最后一行结束。
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
窗口函数分类
聚合窗口函数(Aggregate Window Function)。许多常见的聚合函数也可以作为窗口函数使用,包括 AVG()、SUM()、COUNT()、MAX() 以及 MIN() 等。
排名窗口函数(Ranking Window Function)。排名窗口函数用于对数据进行分组排名,包括 ROW_NUMBER()、RANK()、DENSE_RANK()、PERCENT_RANK()、CUME_DIST() 以及 NTILE() 等函数。
取值窗口函数(Value Window Function)。取值窗口函数用于返回指定位置上的数据行,包括 FIRST_VALUE()、LAST_VALUE()、LAG()、LEAD()、NTH_VALUE() 等函数。
product|ym |amount
-------|------|--------
苹果 |201801|10159.00
苹果 |201802|10211.00
苹果 |201803|10247.00
苹果 |201804|10376.00
苹果 |201805|10400.00
苹果 |201806|10565.00
...
log_id|log_ts |from_user |to_user |type|amount
------|-------------------|--------------|--------------|----|------
1|2019-01-02 10:31:40|62221234567890| |存款 | 50000
2|2019-01-02 10:32:15|62221234567890| |存款 |100000
3|2019-01-03 08:14:29|62221234567890|62226666666666|转账 |200000
4|2019-01-05 13:55:38|62221234567890|62226666666666|转账 |150000
5|2019-01-07 20:00:31|62221234567890|62227777777777|转账 |300000
6|2019-01-09 17:28:07|62221234567890|62227777777777|转账 |500000
...
聚合窗口函数
案例分析:移动平均值
SELECT product AS "产品", ym "年月", amount "销量",
AVG(amount) OVER (
PARTITION BY product
ORDER BY ym
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
) AS "最近平均销量"
FROM sales_monthly
ORDER BY product, ym;
产品|年月 |销量 |最近平均销量
----|------|--------|------------
桔子|201801|10154.00|10154.000000
桔子|201802|10183.00|10168.500000
桔子|201803|10245.00|10194.000000
桔子|201804|10325.00|10251.000000
桔子|201805|10465.00|10345.000000
桔子|201806|10505.00|10431.666667
...
案例分析:累计求和
SELECT product AS "产品", ym "年月", amount "销量",
SUM(amount) OVER (
PARTITION BY product
ORDER BY ym
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS "累计销量"
FROM sales_monthly
ORDER BY product, ym;
产品|年月 |销量 |累计销量
----|------|--------|---------
桔子|201801|10154.00| 10154.00
桔子|201802|10183.00| 20337.00
桔子|201803|10245.00| 30582.00
桔子|201804|10325.00| 40907.00
桔子|201805|10465.00| 51372.00
桔子|201806|10505.00| 61877.00
...
-- Oracle、MySQL 以及 PostgreSQL
SELECT log_ts, from_user, total_amount
FROM (
SELECT log_ts, from_user,
SUM(amount) OVER (
PARTITION BY from_user
ORDER BY log_ts
RANGE INTERVAL '5' DAY PRECEDING
) AS total_amount
FROM transfer_log
WHERE TYPE = '转账'
) t
WHERE total_amount >= 1000000;
log_ts |from_user |total_amount
-------------------|--------------|------------
2021-01-10 07:46:02|62221234567890| 1050000
-- SQLite
WITH tl(log_ts, unix, from_user, amount) AS (
SELECT log_ts, CAST(STRFTIME('%s', log_ts) AS INT), from_user, amount
FROM transfer_log
WHERE type = '转账'
)
SELECT log_ts, from_user, total_amount
FROM (
SELECT log_ts, from_user,
SUM(amount) OVER (
PARTITION BY from_user
ORDER BY unix
RANGE 5 * 86400 PRECEDING
) AS total_amount
FROM tl
) t
WHERE total_amount >= 1000000;
排名窗口函数
ROW_NUMBER 函数可以为分区中的每行数据分配一个序列号,序列号从 1 开始。
RANK 函数返回当前行在分区中的名次。如果存在名次相同的数据,后续的排名将会产生跳跃。
DENSE_RANK 函数返回当前行在分区中的名次。即使存在名次相同的数据,后续的排名也是连续值。
PERCENT_RANK 函数以百分比的形式返回当前行在分区中的名次。如果存在名次相同的数据,后续的排名将会产生跳跃。
CUME_DIST 函数计算当前行在分区内的累积分布。
NTILE 函数将分区内的数据分为 N 等份,并返回当前行所在的分片位置。
案例分析:分类排名
SELECT d.dept_name AS "部门名称", e.emp_name AS "姓名", e.salary AS "月薪",
ROW_NUMBER() OVER (PARTITION BY e.dept_id ORDER BY e.salary DESC) AS "row_number",
RANK() OVER (PARTITION BY e.dept_id ORDER BY e.salary DESC) AS "rank",
DENSE_RANK() OVER (PARTITION BY e.dept_id ORDER BY e.salary DESC) AS "dense_rank",
PERCENT_RANK() OVER (PARTITION BY e.dept_id ORDER BY e.salary DESC) AS "percent_rank"
FROM employee e
JOIN department d ON (e.dept_id = d.dept_id);
部门名称 |姓名 |月薪 |row_number|rank|dense_rank|percent_rank
--------|-----|--------|-----------|----|----------|----------------
行政管理部|刘备 |30000.00| 1| 1| 1| 0.0
行政管理部|关羽 |26000.00| 2| 2| 2| 0.5
行政管理部|张飞 |24000.00| 3| 3| 3| 1.0
...
研发部 |赵云 |15000.00| 1| 1| 1| 0.0
研发部 |周仓 | 8000.00| 2| 2| 2| 0.125
研发部 |关兴 | 7000.00| 3| 3| 3| 0.25
研发部 |关平 | 6800.00| 4| 4| 4| 0.375
研发部 |赵氏 | 6600.00| 5| 5| 5| 0.5
研发部 |廖化 | 6500.00| 6| 6| 6| 0.625
研发部 |张苞 | 6500.00| 7| 6| 6| 0.625
研发部 |赵统 | 6000.00| 8| 8| 7| 0.875
...
-- MySQL、Oracle、PostgreSQL 以及 SQLite
SELECT d.dept_name AS "部门名称", e.emp_name AS "姓名", e.salary AS "月薪",
ROW_NUMBER() OVER w AS "row_number",
RANK() OVER w AS "rank",
DENSE_RANK() OVER w AS "dense_rank",
PERCENT_RANK() OVER w AS "percent_rank"
FROM employee e
JOIN department d ON (e.dept_id = d.dept_id)
WINDOW w AS (PARTITION BY e.dept_id ORDER BY e.salary DESC);
WITH ranked_emp AS (
SELECT d.dept_name,
e.emp_name,
e.hire_date,
ROW_NUMBER() OVER (PARTITION BY e.dept_id ORDER BY e.hire_date) AS rn
FROM employee e
JOIN department d ON (e.dept_id = d.dept_id)
)
SELECT dept_name "部门名称", emp_name "姓名", hire_date "入职日期", rn "入职顺序"
FROM ranked_emp
WHERE rn <= 2;
部门名称 |姓名 |入职日期 |入职顺序
--------|-----|----------|-------
行政管理部|刘备 |2000-01-01| 1
行政管理部|关羽 |2000-01-01| 2
人力资源部|诸葛亮|2006-03-15| 1
人力资源部|魏延 |2007-04-01| 2
财务部 |孙尚香|2002-08-08| 1
财务部 |孙丫鬟|2002-08-08| 2
...
案例分析:累积分布
SELECT emp_name AS "姓名", salary AS "月薪",
CUME_DIST() OVER (ORDER BY salary) AS "累积占比"
FROM employee;
姓名 |月薪 |累积占比
----|--------|-------
蒋琬 | 4000.00|0.08
邓芝 | 4000.00|0.08
庞统 | 4100.00|0.12
...
关羽 |26000.00|0.96
刘备 |30000.00| 1.0
SELECT emp_name AS "姓名", hire_date AS "入职日期",
NTILE(5) OVER (ORDER BY hire_date) AS "分组位置"
FROM employee;
姓名 |入职日期 |分组位置
-----|----------|-------
刘备 |2000-01-01| 1
关羽 |2000-01-01| 1
张飞 |2000-01-01| 1
孙尚香|2002-08-08| 1
孙丫鬟|2002-08-08| 1
赵云 |2005-12-19| 2
...
简雍 |2019-05-11| 5
取值窗口函数
LAG 函数可以返回窗口内当前行之前的第 N 行数据。
LEAD 函数可以返回窗口内当前行之后的第 N 行数据。
FIRST_VALUE 函数可以返回窗口内第一行数据。
LAST_VALUE 函数可以返回窗口内最后一行数据。
NTH_VALUE 函数可以返回窗口内第 N 行数据。
案例分析:环比、同比分析
SELECT product AS "产品", ym "年月", amount "销量",
((amount - LAG(amount, 1) OVER (PARTITION BY product ORDER BY ym))/ LAG(amount, 1) OVER (PARTITION BY product ORDER BY ym)) * 100 AS "环比增长率(%)"
FROM sales_monthly
ORDER BY product, ym;
产品|年月 |销量 |环比增长率(%)
---|------|--------|------------
桔子|201801|10154.00|
桔子|201802|10183.00| 0.285602
桔子|201803|10245.00| 0.608858
...
香蕉|201904|11408.00| 1.063076
香蕉|201905|11469.00| 0.534712
香蕉|201906|11528.00| 0.514430
SELECT product AS "产品", ym "年月", amount "销量",
((amount - LAG(amount, 12) OVER (PARTITION BY product ORDER BY ym))/ LAG(amount, 12) OVER (PARTITION BY product ORDER BY ym)) * 100 AS "同比增长率(%)"
FROM sales_monthly
ORDER BY product, ym;
产品|年月 |销量 |同比增长率(%)
---|------|--------|------------
桔子|201801|10154.00|
桔子|201802|10183.00|
桔子|201803|10245.00|
...
桔子|201901|11099.00| 9.306677
桔子|201902|11181.00| 9.800648
桔子|201903|11302.00|10.317228
...
案例分析:复合增长率
(15000/10000)(1/2) - 1 = 22.47%
WITH s(product, ym, amount, first_amount, num) AS (
SELECT product, ym, amount,
FIRST_VALUE(amount) OVER (PARTITION BY product ORDER BY ym),
ROW_NUMBER() OVER (PARTITION BY product ORDER BY ym)
FROM sales_monthly
)
SELECT product AS "产品", ym "年月", amount "销量",
(POWER(1.0*amount/first_amount, 1.0/NULLIF(num-1, 0)) - 1) * 100 AS "月均复合增长率(%)"
FROM s
ORDER BY product, ym;
产品|年月 |销量 |月均复合增长率(%)
---|------|--------|-----------------
桔子|201801|10154.00|
桔子|201802|10183.00| 0.285602
桔子|201803|10245.00| 0.447100
桔子|201804|10325.00| 0.558233
桔子|201805|10465.00| 0.757067
桔子|201806|10505.00| 0.681987
...
SELECT product AS "产品", ym "年月", amount "销量",
FIRST_VALUE(ym) OVER (
PARTITION BY product ORDER BY amount DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "最高销量月份",
LAST_VALUE(ym) OVER (
PARTITION BY product ORDER BY amount DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "最低销量月份",
-- Microsoft SQL Server 不支持 NTH_VALUE
NTH_VALUE(ym, 3) OVER (
PARTITION BY product ORDER BY amount DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "第三高月份"
FROM sales_monthly
ORDER BY product, ym;
产品|年月 |销量 |最高销量月份|最低销量月份|第三高月份
---|------|-----|----------|----------|---------
桔子|201801|10154|201906 |201801 |201904
桔子|201802|10183|201906 |201801 |201904
桔子|201803|10245|201906 |201801 |201904
桔子|201804|10325|201906 |201801 |201904
桔子|201805|10465|201906 |201801 |201904
桔子|201806|10505|201906 |201801 |201904
...

