hive窗口函数
目录
1.创建表格 插入数据
CREATE EXTERNAL TABLE IF NOT EXISTS learn3.student1(
id STRING COMMENT "学生ID",
name STRING COMMENT "学生姓名",
age int COMMENT "年龄",
gender STRING COMMENT "性别",
clazz STRING COMMENT "班级"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
load data local inpath "/usr/local/soft/hive-3.1.2/data/students.txt" INTO TABLE learn3.student1;
CREATE EXTERNAL TABLE IF NOT EXISTS learn3.score1(
id STRING COMMENT "学生ID",
subject_id STRING COMMENT "科目ID",
score int COMMENT "成绩"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
load data local inpath "/usr/local/soft/hive-3.1.2/data/score.txt" INTO TABLE learn3.score1;
2.需求:
1)统计各性别年龄前三
-- HIVE 原先老版本 不支持这种写法
select
s1.*
from learn3.student1 as s1
where 3 > (select count(*) from learn3.student1 as s2
where s1.gender=s2.gender and s1.age<s2.age);
2)统计各班级学生总成绩前三名
①算出学生总成绩
WITH score_sum AS(
select
id
,sum(score) as total_score
from learn3.score1
group by id
)
select
T2.name,T1.total_score,T2.clazz
from score_sum t1
JOIN learn3.student1 t2 on t1.id=t2.id;
② 排序获取前三名
获取前三名 我们HIve这里提供了一个叫窗口函数的
ROW_NUMBER:给数据做标记
OVER:窗口
ROW_NUMBER() OVER(PARTITION BY 分组字段 ORDER BY 排序字段)
①算出学生总成绩
WITH score_sum AS (
SELECT
id
,sum(score) as total_score
FROM learn3.score1
GROUP BY id
)
, studen_score AS (
SELECT
T1.name
,T2.total_score
,T1.clazz
FROM learn3.student1 T1
JOIN score_sum T2 ON T1.id = T2.id
)
--② 排序获取前三名
SELECT
TT.*
FROM (
SELECT
T.name
,T.total_score
,T.clazz
, ROW_NUMBER() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as row_pm
FROM studen_score T
) TT
WHERE TT.row_pm <= 3
③总结
##### row_number :无并列排名
* 用法: select xxxx, row_number() over(partition by 分组字段 order by 排序字段 desc) as rn from tb group by xxxx
##### dense_rank :有并列排名,并且依次递增
* 用法: select xxxx, dense_rank() over(partition by 分组字段 order by 排序字段 desc) as rn from tb group by xxxx
##### rank :有并列排名,不依次递增
* 用法: select xxxx, rank() over(partition by 分区字段 order by 排序字段 desc) as rn from tb group by xxxx
##### percent_rank:(rank的结果-1)/(分区内数据的个数-1)
* 用法: select xxxx, percent_rank() over(partition by 分组字段 order by 排序字段 desc) as rn from tb group by xxxx
3)统计各班级学生总成绩最高的
① 上一个程序直接改为1
WITH score_sum AS (
SELECT
id
,sum(score) as total_score
FROM learn3.score1
GROUP BY id
)
, studen_score AS (
SELECT
T1.name
,T2.total_score
,T1.clazz
FROM learn3.student1 T1
JOIN score_sum T2 ON T1.id = T2.id
)
SELECT
TT.*
FROM (
SELECT
T.name
,T.total_score
,T.clazz
, ROW_NUMBER() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as row_pm
FROM studen_score T
) TT
WHERE TT.row_pm == 1
② max
创建表格
CREATE TABLE learn3.student_score(
name STRING COMMENT "",
total_score int COMMENT "",
clazz STRING COMMENT ""
);
插入数据
WITH score_sum AS (
SELECT
id
,sum(score) as total_score
FROM learn3.score1
GROUP BY id
)
INSERT INTO TABLE learn3.student_score
SELECT
T1.name
,T2.total_score
,T1.clazz
FROM learn3.student1 T1
JOIN score_sum T2 ON T1.id = T2.id;
-- 通过max方法取出每个班级分区中的学生成绩最大值
SELECT
TT.*
FROM (
SELECT
T1.*
,max(T1.total_score) OVER(PARTITION BY T1.clazz ORDER BY T1.total_score DESC) as max_score
FROM learn3.student_score T1
) TT WHERE TT.total_score = TT.max_score;
-- 如果max() OVER() 中的窗口over()没有给定分区,那么当前的窗口表示整个学校,得到的数据是整个学校的最高的分数
SELECT
TT.*
FROM (
SELECT
T1.*
,max(T1.total_score) OVER() as max_score
FROM learn3.student_score T1
) TT WHERE TT.total_score = TT.max_score;
③总结
max
用法:
① max(T1.total_score) OVER(PARTITION BY T1.clazz ORDER BY T1.total_score DESC)
基于每个partition分区内数据取最大值
② max(T1.total_score) OVER()
基于整个数据集取最大值
min、avg、count、sum:与max方法使用一致