标题 创建时间 发布时间 最后一次修改 备注
Hive 常见问题速查 2020.08.25 2020.12.21 2021.08.10 /

函数大全

https://www.iteblog.com/archives/2258.html

Hive 结果采样

select * from my_table
where rand() <= 0.0001
distribute by rand()
sort by rand()
limit 10000;

--- source: http://www.joefkelley.com/736/

hive 填补空值

--- https://stackoverflow.com/questions/63891981/sql-hive-replace-null-values-with-0-hadoop-hive
select
	coalesce(t2.Apple, 0) as apple
from
	table;

hive 分箱

# https://www.jianshu.com/p/a67280db6b33

-- Ntile(n) over(order by col):分块函数
-- 备注:NULL值的处理,是否需要单独为1组。

select  col
        -- NULL默认为最小值
        , ntile(2) over( order by col) as group1
        -- 将NULL单独为1组
        , if(col is null, null, ntile(2) over( partition by if(col is null, 1, 0) order by col) as group2
from(
    select cast(col as int) as col
    from(
        select stack(5, 'NULL', '1', '2', '3', '4') as col
    ) as a
) as a

计算时间差

--- https://blog.csdn.net/kent7306/article/details/50441967
--- 计算前后差

--- https://blog.csdn.net/qq_33290422/article/details/81204532
--- 直接计算时间差

json 提取

--- refer: https://www.cnblogs.com/drjava/p/10486134.html
--- [{"name":"张三","sex":"男","age":"25"},{"name":"李","sex":"男","age":"47"}]

SELECT get_json_object(xjson,"$.[0]") FROM person;
--- {"name":"王二狗","sex":"男","age":"25"}

--- 按字段提取
SELECT get_json_object(xjson,"$.[0].age") FROM person;
--- 25

--- **通配提取,不带索引即可**
SELECT get_json_object(xjson,"$.[].age") FROM person;
--- ["25","47"]

--- 通配多层嵌套,字段名后面加[]
SELECT get_json_object(xjson,"$.name[].xx") FROM person;

临时表运用

--- refer: https://www.jianshu.com/p/d518e9f5d5f9
--- note: hive 嵌套使用不支持

with temp as (
    select * from xxx
)
select * from temp;

行转列 pivot

--- 合并多行结果到一列

select user
   , concat_ws(',', collect_set(item)) as item_list -- collect set list 最后结果不同
from user_item_table
group by user;


--- https://stackoverflow.com/questions/23025380/how-to-transpose-pivot-data-in-hive
SELECT
  uid,
  kv['c1'] AS c1,
  kv['c2'] AS c2,
  kv['c3'] AS c3
FROM (
  SELECT uid, to_map(key, value) kv
  FROM vtable
  GROUP BY uid
) t

--- https://analyticshut.com/pivot-rows-to-columns-hive/
SELECT
  resource_id,
  COLLECT_SET(quarter_1) [ 0 ] AS quarter_1_spends,
  COLLECT_SET(quarter_2) [ 0 ] AS quarter_2 spends,
  COLLECT_SET(quarter_3) [ 0 ] AS quarter_3_spends,
  COLLECT_SET(quarter_4) [ 0 ] AS quarter_4_spends
FROM
  (
    SELECT
      resource_id,
      CASE
        WHEN quarter = 'Q1' THEN amount
      END AS quarter_1,
      CASE
        WHEN quarter = 'Q2' THEN amount
      END AS quarter_2,
      CASE
        WHEN quarter = 'Q3' THEN amount
      END AS quarter_3,
      CASE
        WHEN quarter = 'Q4' THEN amount
      END AS quarter_4
    FROM
      billing_info
  ) tbl1
GROUP BY
  resource_id;

列转行 unpivot

--- refer: https://stackoverflow.com/questions/39353425/how-to-convert-columns-into-rows-in-hive
select
  col1,
  col2,
  value
from
  orig_table lateral view explode(array(col3, col4, col5)) orig_table_alias as value;
  
 --- unpivot https://stackoverflow.com/questions/23025380/how-to-transpose-pivot-data-in-hive
SELECT t1.uid, t2.key, t2.value
  FROM htable t1
  LATERAL VIEW explode (map(
    'c1', c1,
    'c2', c2,
    'c3', c3
  )) t2 as key, value

结果存储

--- https://www.cnblogs.com/Allen-rg/p/9295615.html

取尾数

***
and substr(item,-1) = 0  -- 采样取尾数为0

left join 注意事项

--- select 无选定内容时,会出现null结果

full out join 选择不为空的列值

--- refer https://www.jianshu.com/p/cb5c42bebd15

SELECT
  COALESCE(table1.id, table2.id) AS id,
  table1.name,
  table1.family,
  table2.orderr,
  table2.countt
FROM
  table1
  FULL OUTER JOIN table2 ON table1.id = table2.id

hive 日期处理

--- refer https://my.oschina.net/u/4308120/blog/3321350
select date_add('2020-01-01',1)  -- 日期加一
select date_sub('2020-01-01',1)  -- 日期减一

--- refer https://sqlandhadoop.com/hive-date-functions-all-possible-date-operations/
select date_format(current_date,'u'); -- Day of the Week

rlike 匹配多个条件

--- refer https://stackoverflow.com/questions/33626645/hive-query-with-multiple-like-operators
***
WHERE some_col rlike 'abc|pqr|xyz' --- 默认前后通配 %*%

partition 排序

--- refer https://www.cnblogs.com/52xf/p/4209211.html

--- ROW_NUMBER 不出现重复数字 1 2 3 4 
--- RANK 同一组数字相同,但后续会跳过 1 1 3 4
--- DENSE_RANK 同一组数字相同,后续不跳过 1 1 2 3

时间戳 timestamp to date

--- https://stackoverflow.com/questions/7211224/how-to-convert-unix-epoch-time-to-date-string-in-hive

select date_format(from_unixtime(epoch_datetime),'yyyyMM') as formatted_date from myHiveTable;

hdfs 操作

# https://www.cnblogs.com/lizm166/p/13355183.html
hdfs dfs -ls
hdfs dfs -mkdir [-p] <paths>
hdfs dfs -get [-ignoreCrc] [-crc] <src> <localdst>

关于作者