spark-sql> desc test2;
userid int NULL
date string NULL
list_01
array<string> NULL
target string NULL
group_num int NULL
Time taken: 0.526 seconds, Fetched 5 row(s)
spark-sql> select * from test2;
1 20220303 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 1:20220303 1
1 20220202 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 1:20220202 1
1 20220202 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 1:20220202 1
2 20220202 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 2:20220202 2
3 20220303 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 3:20220303 2
3 20220303 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 3:20220303 2
3 20220303 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 3:20220303 3
3 20220303 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"] 3:20220303 3
spark-sql> select named_struct('a',target, 'b',list_01) from test2 limit 1;
{"a":"1:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
1、构建类型 struct<a:string,b:array<string>>
spark-sql>
-- Step 1: wrap each test2 row's (target, list_01) into a single
-- struct<a:string,b:array<string>> column, keeping group_num for later grouping.
DROP TABLE IF EXISTS test3;
CREATE TABLE test3 AS
SELECT
    named_struct('a', target, 'b', list_01) AS content_shi_group,
    group_num
FROM test2;
{"a":"1:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 1
{"a":"1:20220202","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 1
{"a":"1:20220202","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 1
{"a":"2:20220202","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 2
{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 2
{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 2
{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 3
{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 3
2、按group_num聚合，生成类型 array<struct<a:string,b:array<string>>>
spark-sql>
-- Step 2: collapse test3 to one row per group_num, collecting the structs
-- into an array<struct<a:string,b:array<string>>> column.
DROP TABLE IF EXISTS test4;
CREATE TABLE test4 AS
SELECT
    group_num,
    collect_list(content_shi_group) AS list_struct
FROM test3
GROUP BY group_num;
1 [{"a":"1:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]},{"a":"1:20220202","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]},{"a":"1:20220202","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}]
3 [{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]},{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}]
2 [{"a":"2:20220202","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]},{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]},{"a":"3:20220303","b":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}]
3、类型 array<struct<a:string,b:array<string>>> 转 map<string,array<string>>
spark-sql>
-- Step 3: explode each struct in list_struct back into (k1, v1) pairs with
-- inline(), then rebuild each pair as a one-entry map<string,array<string>>.
-- Note: this yields one row (one single-key map) per struct, not one merged
-- map per group — matching the output shown below.
DROP TABLE IF EXISTS test5;
CREATE TABLE test5 AS
SELECT
    group_num,
    map(k1, v1) AS map_list
FROM test4
LATERAL VIEW inline(list_struct) t1 AS k1, v1;
1 {"1:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
1 {"1:20220202":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
1 {"1:20220202":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
3 {"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
3 {"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
2 {"2:20220202":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
2 {"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
2 {"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]}
spark-sql> select inline(list_struct) from test4 where group_num=1;
1:20220303 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]
1:20220202 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]
1:20220202 ["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]
4、直接生成 hive map<string,array<string>> 类型
spark-sql>
-- Step 4: build the map<string,array<string>> column directly from test2,
-- skipping the intermediate struct/array detour of steps 1-3.
DROP TABLE IF EXISTS test3_02;
CREATE TABLE test3_02 AS
SELECT
    map(target, list_01) AS content_group,
    group_num
FROM test2;
{"1:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 1
{"1:20220202":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 1
{"1:20220202":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 1
{"2:20220202":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 2
{"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 2
{"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 2
{"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 3
{"3:20220303":["1:20220202","1:20220202","2:20220202","3:20220303","3:20220303","3:20220303","3:20220303","1:20220303"]} 3
spark-sql> desc test3_02;
content_group map<string,array<string>> NULL
group_num int NULL