get_json_object(string json_string, string path)
解析 json 字符串 json_string ,返回 path 指定的 json 对象,返回该对象的 json 字符串
如果输入 json 字符串无效,将返回 null
该函数每次只能返回一个数据项
drop table if exists test.test; create table IF NOT EXISTS test.test( json string ); insert into test.test values ('{"id": 1,"ids": [101,102,103],"total_number": 3}'), ('{"id": 2,"ids": [201,202,203,204],"total_number": 4}'), ('{"id": 3,"ids": [301,302,303,304,305],"total_number": 5}'); -- get 单层值 select get_json_object(json, "$.id") id, get_json_object(json, "$.ids") ids, get_json_object(json, "$.total_number") num from test.test; -- get 数组 select get_json_object(json, "$.id") id, get_json_object(json, "$.ids[0]") ids0, get_json_object(json, "$.ids[1]") ids1, get_json_object(json, "$.ids[2]") ids2, get_json_object(json, "$.ids[3]") ids3, get_json_object(json, "$.total_number") num from test.test;
json_tuple(string jsonStr,string k1,...,string kn)
类似上面的 get_json_object(string json_string, string path) 函数,但可以一次获取多个 jsonPath
参数为一组键 k1,k2…
这是一个 UDTF 函数(但遇到重复 key 只会返回后一个值,不会返回多行)
select json_tuple(json, 'id', 'ids', 'total_number') from test.test; drop table if exists test.test; create table IF NOT EXISTS test.test( json string ); insert into test.test values ('{"a": "a1-1","a": "a1-2","b": "b1","c": "c1"}'), ('{"a": "a2-1","a": "a2-2","b": "b2","c": "c2"}'), ('{"a": "a3-1","a": "a3-2","b": "b3","c": "c3"}'); select json_tuple(json, 'a', 'b', 'c') from test.test;
扩展:
对于简单格式的 json 数据,可使用 get_json_object 、json_tuple 函数处理
get_json_object
json_tuple
对于纯 json 串可使用 JsonSerDe(序列化反序列化)处理
对于嵌套数据类型的复杂 json ,可使用 UDF(自定义函数)
UDF
parse_url(string urlString, string partToExtract [, string keyToExtract])
解析 URL ,从中提取指定的字段
partToExtract 有:HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, USERINFO
HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, USERINFO
select parse_url('https://github.com/HonorKnight8/gdflbd', 'HOST'), parse_url('https://github.com/HonorKnight8/gdflbd', 'PATH'), parse_url('https://hostname.com/path?k1=v1&k2=v2#ref1', 'REF'), parse_url('https://hostname.com/path?k1=v1&k2=v2#ref1', 'QUERY', 'k2');
parse_url_tuple(string urlStr,string p1,...,string pn)
类似 parse_url 函数,当可以一次获取多个字段
parse_url
partToExtract 有:HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, USERINFO, QUERY:<KEY>
HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, USERINFO, QUERY:<KEY>
这是一个 UDTF 函数
select parse_url_tuple('https://hostname.com/path?k1=v1&k2=v2#ref1', 'PROTOCOL', 'QUERY', 'QUERY:k1');
参考
xpath(xml, xpath)
从 xml 字符串找到与 xpath 表达式匹配的 xml 节点,返回数组
-- 获取指定节点的值 SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/text()'); SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/b/text()'); SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/c/text()'); -- 获取所有名为 id 的属性值 select xpath('<a><b id="foo">b1</b><b id="bar">b2</b></a>','//@id'); -- 获取 class 属性为 bb 的节点 SELECT xpath ('<a><b class="bb">b1</b><b>b2</b><b>b3</b><c class="bb">c1</c><c>c2</c></a>', 'a/*[@class="bb"]/text()');
xpath_string(xml, xpath)
从 xml 字符串中找到与 xpath 表达式匹配的第一个 xml 节点返回字符串
SELECT xpath_string('<a><b>b</b><c>cc</c></a>','a/c'); SELECT xpath_string('<a><b>b1</b><b>b2</b></a>','a/b'); SELECT xpath_string('<a><b>b1</b><b>b2</b></a>','a/b[2]'); SELECT xpath_string('<a><b>b1</b><b>b2</b></a>','a'); -- 从第一个节点获取具有值为“ b_2”的属性“ id”的文本: SELECT xpath_string ('<a><b>b1</b><b id="b_2">b2</b></a>', 'a/b[@id="b_2"]');
xpath_boolean(xml, xpath)
匹配到节点而且该节点的值符合 xpath 指定的条件(如果指定了),才返回 true
否则返回 false
SELECT xpath_boolean('<a><b>1</b></a>', 'a/b'), -- 匹配到节点 xpath_boolean ('<a><b>b</b></a>', 'a/c'), -- 匹配不到该节点 xpath_boolean('<a><b>1</b></a>', 'a/b = 1'), -- 该节点的值确实等于 1 xpath_boolean('<a><b>1</b></a>', 'a/b = 2'), -- 该节点的值并不等于 2 xpath_boolean ('<a><b>10</b></a>', 'a/b < 10');
xpath_short(xml, xpath) 或 xpath_int(xml, xpath) 或 xpath_long(xml, xpath)
xpath_short(xml, xpath)
xpath_int(xml, xpath)
xpath_long(xml, xpath)
返回 xml 字符串中经过 xpath 表达式计算后的值
如果不匹配,则返回 0
如果溢出,返回该数据类型的最大值
SELECT xpath_int ('<a>this is not a number</a>', 'a'), -- 不匹配 xpath('<a><b>1</b><b>2</b></a>', 'a/b/text()'), xpath_int('<a><b>1</b><b>2</b></a>', 'sum(a/b)'), xpath_long('<a><b>10.5</b><c>11.2</c></a>','sum(a/*)'), xpath_int ('<a><b>20000000000</b><c>40000000000</c></a>', 'a/b * a/c'), -- 溢出 xpath_int ('<a><b class="odd">1</b><b class="even">2</b><b class="odd">4</b><c>8</c></a>', 'sum(a/*)'), xpath_int ('<a><b class="odd">1</b><b class="even">2</b><b class="odd">4</b><c>8</c></a>', 'sum(a/b)'), xpath_int ('<a><b class="odd">1</b><b class="even">2</b><b class="odd">4</b><c>8</c></a>', 'sum(a/b[@class="odd"])');
xpath_float(xml, xpath) 或 xpath_double(xml, xpath) 或 xpath_number(xml, xpath)
xpath_float(xml, xpath)
xpath_double(xml, xpath)
xpath_number(xml, xpath)
后两个实际是同一个函数:org.apache.hadoop.hive.ql.udf.xml.UDFXPathDouble
org.apache.hadoop.hive.ql.udf.xml.UDFXPathDouble
不匹配,返回 0
匹配到非数字,返回 NaN
SELECT xpath_double ('<a>b</a>', 'a = 10'), -- 不匹配 xpath_double ('<a>this is not a number</a>', 'a'), -- 匹配到非数字 xpath_double ('<a><b>2000000000</b><c>40000000000</c></a>', 'a/b * a/c'), -- 大数 xpath_double('<a><b>1</b><b>2</b></a>','sum(a/b)'), xpath_double('<a><b>10.5</b><c>11.2</c></a>','sum(a/*)');