http://blog.itpub.net/29254281/viewspace-2157111/
这是上文的优化版本
创建试验数据,5天每天一百万随机数据,总共500w数据
create table nums(id int not null primary key);
delimiter $$
create procedure pFastCreateNums(cnt int)
begin
declare s int default 1;
truncate table nums;
insert into nums select s;
while s*2<=cnt do
insert into nums select id+s from nums;
set s=s*2;
end while;
end $$
delimiter ;
call pFastCreateNums(2000000);
drop table if exists t ;
create table t(
query_time date,
ts float,
key(query_time,ts)
);
insert into t select '2018-07-01',round(100000*rand(),2) from nums where id<=1000000;
insert into t select '2018-07-02',round(100000*rand(),2) from nums where id<=1000000;
insert into t select '2018-07-03',round(100000*rand(),2) from nums where id<=1000000;
insert into t select '2018-07-04',round(100000*rand(),2) from nums where id<=1000000;
insert into t select '2018-07-05',round(100000*rand(),2) from nums where id<=1000000;
首先,修正上文的SQL,增加精度,因为在大数据量下,会有显著的误差。
-
select query_time,v,ts
-
from (
-
select t6.query_time,t6.ts,v,seq,
-
case when @gid=concat(seq,'#',query_time) then @rn:=@rn+1 when @gid:=concat(seq,'#',query_time) then @rn:=1 end s
-
from (
-
select query_time,ts,rn,percent,v,v-percent d,seq from (
-
select t2.query_time,ts,rn,round(rn/total,10) percent from (
-
select query_time,ts,
-
case when @gid=query_time then @rn:=@rn+1 when @gid:=query_time then @rn:=1 end rn
-
from (
-
select * from t ,(select @gid:='',@rn:=0) vars order by query_time,ts
-
) t1
-
) t2 inner join (
-
select query_time,count(*) total from t group by query_time
-
) t3 on(t2.query_time=t3.query_time)
-
) t4 ,
-
(select 0.71 v,1 seq union all select 0.81,2 union all select 0.91,3) t5
-
) t6 where d>=0 order by query_time,v,d
- ) t7 where s=1 order by query_time,seq ;
在ssd环境下,上文的SQL运行时长和结果如下.

148.813 s
前文这个SQL的计算结果是非常精确的
但是计算时间和 采样点数量 有巨大关系. 假如原始数据是100w,三个百分位数的采样,则数据扩张到300w;4个百分位数的采样,则数据扩张到400w.这是因为使用笛卡尔积扩张了数据的缘故.
优化版本:
-
select query_time,d,max(ts) ts from (
-
select t2.query_time,ts,rn,round(rn/total,10) percent,
-
case
-
when 0.71>=round(rn/total,10) then 0.71
-
when 0.81>=round(rn/total,10) then 0.81
-
when 0.91>=round(rn/total,10) then 0.91
-
end d
-
from (
-
select query_time,ts,
-
case when @gid=query_time then @rn:=@rn+1 when @gid:=query_time then @rn:=1 end rn
-
from (
-
select * from t ,(select @gid:='',@rn:=0) vars order by query_time,ts
-
) t1
-
) t2 inner join (
-
select query_time,count(*) total from t group by query_time
-
) t3 on(t2.query_time=t3.query_time)
-
) t6
-
where d is not null
- group by query_time,d
结果:

用时:
33.922 秒
这个版本的优点是增加百分位数采样点,不会增加开销.
缺点是在非常小的样本环境中,会有误差.
不过后续也有手段消除误差.