数据处理
Hive

Hive 内置函数

Hive 内置函数的详细描述请参阅 Hive 官方文档（LanguageManual UDF），以下展示常用部分。

# 查看系统函数

show functions;

# 显示自带函数用法

desc function upper;
desc function extended upper;

1
2

例：

desc function length;

# 字符串函数

-- length(str | binary)，返回字符串长度或bytes长度
select length('abc123');
> 6

-- substr(str, pos[, len])，截取字符串。str 为输入的字符串，pos 为起始位置（下标从 1 开始），len 为截取长度（可以省略）
select substr('abcdef', 2);
> 'bcdef'
select substr('abcdef', 2, 2);
> 'bc'

-- upper(str)，字符串转大写
select upper('Facebook');
> 'FACEBOOK'

-- lower(str)，字符串转小写
select lower('Facebook');
> 'facebook'

-- trim(str)，去除字符串左右两边的空格
select trim('   abc  def  ');
> 'abc  def'

-- ltrim(str)，去除左边的空格
select ltrim('   abc  def  ');
> 'abc  def  '

-- rtrim(str)，去除右边的空格
select rtrim('   abc  def  ');
> '   abc  def'

-- reverse(str)，字符串反转
select reverse('abcde');
> 'edcba'

-- regexp_replace(str, regexp, rep)，正则替换
select regexp_replace('100-200', '(\\d+)', 'num');
> 'num-num'

-- get_json_object(json_txt, path)，根据指定的 json 路径从 json 字符串中提取 json 对象，并返回提取的 json 对象的 json 字符串。如果输入的 json 字符串无效，它将返回 null。
-- path 的表达符:
    -- $ : Root object
    -- . : Child operator
    -- [] : Subscript operator for array
    -- * : Wildcard for []
-- 假设有字符串为：
    -- {"store":
    --   {"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
    --    "bicycle":{"price":19.95,"color":"red"}
    --   },
    --  "email":"amy@only_for_json_udf_test.net",
    --  "owner":"amy"
    -- }
select get_json_object('{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} },"email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.owner');
> 'amy'

select get_json_object('{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} },"email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.store.fruit[0]');
> {"weight":8,"type":"apple"}

select get_json_object('{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} },"email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.non_exist_key');
> NULL

-- concat(str1, str2, ... strN)，将多个字符串参数连接并返回。
select concat('abc', 'def');
> 'abcdef'

-- concat_ws(separator, [string | array(string)]+)，将多个字符串连接，并用指定分隔符分隔
select concat_ws('-', 'abc', 'def');
> 'abc-def'

-- space(n)，返回 n 个空格
select space(2);
> '  '

-- split(str, regex)，分隔字符串
select split('oneAtwoBthreeC', '[ABC]');
> ["one","two","three",""]

select split('abc,def', ',');
> ["abc","def"]

-- find_in_set(str, str_array)，返回 str 在 str_array 中第一次出现的位置（从 1 开始，未找到返回 0），str_array 是逗号分隔的字符串。
select find_in_set('ab','abc,b,ab,c,def');
> 3

-- regexp_extract(str, regexp[, idx])，提取正则匹配到的指定组内容
select regexp_extract('100-200', '(\\d+)-(\\d+)', 2);
> 200

-- parse_url(url, partToExtract[, key])，url 解析函数
select parse_url('http://facebook.com/path/p1.php?query=1', 'HOST');
> 'facebook.com'

select parse_url('http://facebook.com/path/p1.php?query=1', 'QUERY');
> 'query=1'

select parse_url('http://facebook.com/path/p1.php?query=1', 'QUERY', 'query');
> '1'

-- repeat(str, n)，重复 str 字符串 n 次
select repeat('abc', 3);
> 'abcabcabc'

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

# 日期函数

-- current_date()，获取当前日期
select current_date();
> '2022-01-08'

-- unix_timestamp([date[, pattern]])，返回 UNIX 时间戳，将当前或指定时间转换为自 1970-01-01 以来的秒数。
select unix_timestamp();
> 1641642834

select unix_timestamp('2022-01-08 12:12:12');
> 1641615132

-- current_timestamp()，获取当前时间戳，同一个查询中多次调用 current_timestamp 返回相同的结果
select current_timestamp();
> '2022-01-08 20:27:33.123'

-- from_unixtime(unix_time[, format])，将 UNIX 时间戳（秒）转换为日期时间字符串，format 可以省略
select from_unixtime(1641615132);
> '2022-01-08 12:12:12'

select from_unixtime(1641615132, 'yyyyMMdd');
> '20220108'

select from_unixtime(1641615132, 'yyyy-MM-dd HH:mm:ss');
> '2022-01-08 12:12:12'

-- to_date(expr)，提取日期
select to_date('2022-01-08 12:12:12');
> '2022-01-08'

-- year(param)，返回 date/时间戳/interval 的年份部分
select year('2022-01-08');
> 2022

-- month(param)，返回 date/时间戳/interval 的月份部分
select month('2022-01-08');
> 1

-- day(param)，返回 date/时间戳/interval 中的天部分
select day('2022-01-08');
> 8

-- hour(param)，返回 date/时间戳/interval 的小时部分
select hour('2022-01-08 12:10:00');
> 12

-- minute(param)，返回 date/时间戳/interval 的分钟部分
select minute('2022-01-08 12:10:00');
> 10

-- second(param)，返回 date/时间戳/interval 的秒部分
select second('2022-01-08 12:10:00');
> 0

-- weekofyear(date)，返回指定日期为当年的第几周
select weekofyear('2022-01-08');
> 1

-- datediff(date1, date2)，返回两个日期间隔的天数
select datediff('2022-01-08', '2022-01-01');
> 7

-- date_add(start_date, num_days)，日期增加指定天数
select date_add('2021-01-08', 2);
> '2021-01-10'

-- date_sub(start_date, num_days)，日期减去指定天数
select date_sub('2021-01-08', 2);
> '2021-01-06'

-- add_months(start_date, num_months)，返回 start_date 之后 num_months 个月的日期
select add_months('2022-01-08', 2);
> '2022-03-08'

-- dayofmonth(param)，查询当月第几天
select dayofmonth('2022-01-08');
> 8

-- last_day(date)，计算月末
select last_day('2022-01-08');
> '2022-01-31'

-- date_format(date/timestamp/string, fmt)，将date、timestamp、string按照指定格式输出为字符串
select date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss');
> '2022-01-08 20:27:33'

select date_format(current_date(), 'yyyyMMdd');
> '20220108'

select date_format('2022-01-08', 'yyyy-MM-dd HH:mm:ss');
> '2022-01-08 00:00:00'

# 数学函数

-- round(x[, d])，四舍五入，d 表示保留几位小数，默认为 0
select round(3.1415926);
> 3.0
select round(3.1415926, 2);
> 3.14
select round(3.1415926, 3);
> 3.142

-- ceil(x)，向上取整
select ceil(3.1415926);
> 4

-- floor(x)，向下取整
select floor(3.1415926);
> 3

-- abs(x)，求绝对值
select abs(-1);
> 1

-- rand([seed])，返回 0 到 1 之间的随机数，指定 seed 可以得到一个稳定的随机数
select rand();
> 0.05166193359544935

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

# 条件函数

-- if(expr1,expr2,expr3)，如果满足 expr1，则返回 expr2 的值，否则返回 expr3
select if(1 < 2, 1, 2);
> 1

-- isnull(expr)，判断是否为 null
select isnull(1);
> false

-- isnotnull(expr)，判断是否非 null
select isnotnull(1);
> true

-- nvl(value, default_value)，如果非空，返回 value，否则返回 default_value
select nvl('a', 'b');
> 'a'

select nvl(null, 'b');
> 'b'

-- case a when b then c [when d then e]* [else f] end，当 a = b，返回 c，当 a = d，返回 e，否则返回 f
select
    case 1 + 1
        when 1 then 'A'
        when 2 then 'B'
        else 'C'
    end;
> 'B'

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

# 集合函数

-- size(a)，返回 a 的大小
select size(array(1, 2, 3));
> 3

-- map_keys(map)，返回 map 的所有 key
select map_keys(map('a', 1, 'b', 2, 'c', 3));
> ["a","b","c"]

-- map_values(map)，返回 map 的所有 value
select map_values(map('a', 1, 'b', 2, 'c', 3));
> [1,2,3]

-- array_contains(array, value)，判断数组是否包含某个值
select array_contains(array(1,2,3), 2);
> true

-- sort_array(array(obj1, obj2,...))，数组排序
select sort_array(array(2,1,4,3));
> [1,2,3,4]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

# 数据脱敏函数

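-- mask(str[, upper[, lower[, number]]])，返回脱敏后的字符串。默认大写字母替换为 X，小写字母替换为 x，数字替换为 n
select mask('abcd-EFGH-8765-4321');
> 'xxxx-XXXX-nnnn-nnnn'

-- mask_first_n(str[, n])，对前 n 个字符脱敏
select mask_first_n('1234-5678-8765-4321', 4);
> 'nnnn-5678-8765-4321'

-- mask_last_n(str[, n])，对后 n 个字符脱敏
select mask_last_n('1234-5678-8765-4321', 4);
> '1234-5678-8765-nnnn'

-- mask_show_first_n(str[, n])，保留前 n 个字符，其余字符脱敏
select mask_show_first_n('1234-5678-8765-4321', 4);
> '1234-nnnn-nnnn-nnnn'

-- mask_show_last_n(str[, n])，保留后 n 个字符，其余字符脱敏
select mask_show_last_n('1234-5678-8765-4321', 4);
> 'nnnn-nnnn-nnnn-4321'

-- mask_hash(str)，返回基于 str 的哈希值，相同输入得到相同结果，可用于脱敏后跨表关联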

1
2
3
4
5
6
7
8
9
10
11
12

上次更新: 2023/11/01, 03:11:44

← Hive DQL Hive UDF→