示例数据
drop table if exists test.test; create table IF NOT EXISTS test.test( name string, salary int, dept string ); insert into test.test values ('刘一', 5000, 'dept1'), ('陈二', 6000, 'dept1'), ('张三', 5000, 'dept1'), ('李四', 4500, 'dept1'), ('王五', 12000, 'dept2'), ('赵六', 15000, 'dept2'), ('孙七', 13000, 'dept2'), ('周八', 8000, 'dept2'), ('吴九', 12000, 'dept2'), ('郑十', 12000, 'dept2'); select * from test.test;
排名函数
从 1 开始,生成数据项在分组中的排名(序号、行号)。
row_number()
rank()
dense_rank()
select salary, row_number() over(order by salary), row_number() over(partition by dept order by salary), rank() over(partition by dept order by salary), dense_rank() over(partition by dept order by salary) from test.test;
cume_dist()
参考
小于等于当前值的行数 / 分组内总行数
select salary, rank() over(order by salary), cume_dist() over(order by salary) from test.test; -- 结果可以理解为小于等于当前薪水的人数占分组总人数的比率
percent_rank()
字面意思是百分比排名
算法是:( 分组内当前行的 rank 值 - 1 )/( 分组内总行数 - 1 )
基本使用:
select salary, rank() over(order by salary), percent_rank() over(order by salary) from test.test;
其结果相当于当前数字在该分组中的百分位数:
with tmp as( select salary, rank() over(order by salary) as rank, percent_rank() over(order by salary) as p_rank from test.test ) select salary, collect_list(p_rank) over() as p_ranks from tmp; with tmp1 as( select salary, rank() over(order by salary) as rank, percent_rank() over(order by salary) as p_rank from test.test ) select chr(10), -- 在结果中插入一个换行(对齐,方便观察结果) collect_list(cast(salary as float)), chr(10), percentile(salary, p_ranks) from( select salary, collect_list(p_rank) over() as p_ranks from tmp1 ) temp2;
ntile(INTEGER x) 分组函数
ntile(INTEGER x)