大数据Spark “蘑菇云”行动第101课:Hive性能调优之企业级数据倾斜解决方案及对Job数目的优化
-- Group-by skew settings: map-side partial aggregation plus Hive's
-- skew-aware group-by plan.
-- Configuration keys are case-sensitive; the documented property name is
-- all lowercase (hive.groupby.skewindata).
set hive.map.aggr=true;
set hive.groupby.skewindata=true;
-- Baseline join of logs to users on userid.
-- NOTE(review): presumably the query that suffers from skew on a hot userid
-- value; confirm against the lesson notes.
select *
from logs a
join users b
    on a.userid = b.userid;
首先把userid=-1的数据保存到HDFS上,然后专门启动一个MapJoin任务来处理userid=-1的数据
-- Enable Hive's runtime skew-join optimization.
set hive.optimize.skewjoin=true;
-- Number of rows sharing one join key above which Hive treats that key as skewed.
set hive.skewjoin.key=100000;
-- Manual skew-join rewrite: join the userid = -1 rows on their own, then
-- union them with the join of all remaining userid values.
-- NOTE(review): the comparison in the second branch is reconstructed as
-- `<> -1` so that the two branches are complementary; confirm the intended
-- skewed key value.
select tmp.*
from (
    select a.*
    from (select * from logs where userid = -1) a
    join (select * from users where userid = -1) b
        on a.userid = b.userid
    union all
    select a.*
    from logs a
    join users b
        on a.userid <> -1
        and a.userid = b.userid
) tmp;
-- Let Hive execute jobs of a query in parallel (hive.exec.parallel).
set hive.exec.parallel=true;