gpt4 book ai didi

hadoop - 在不指定列名和列类型的情况下创建一个 Hive 表

转载 作者:可可西里 更新时间:2023-11-01 15:26:22 24 4
gpt4 key购买 nike

我在 HDFS 上存储了包含 1000 列的庞大数据集。我想创建一个 Hive 表来过滤和处理数据。

-- External Hive table over the comma-delimited text files stored under
-- /folder1/. Every column is declared by hand, and each name must be unique.
CREATE EXTERNAL TABLE IF NOT EXISTS tablename (
    var1 INT,
    var2 STRING,
    var3 STRING  -- was a second `var2`; duplicate column names are rejected
)
COMMENT 'testbykasa'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/folder1/';

对于列数较少的情况(约 5-10 列),我会手动指定列名和列类型。有没有办法让 Hive 通过推断列名和数据类型来创建表,而无需手动指定?

最佳答案

演示

我的数据.csv

2,2,8,1,5,1,8,1,4,1,3,4,9,2,8,2,6,5,3,1,5,5,8,0,1,6,0,7,1,4
2,6,8,7,7,9,9,3,8,7,3,1,9,1,7,5,9,7,1,2,5,7,0,5,1,2,6,4,0,4
0,0,1,3,6,5,6,2,4,2,4,9,0,4,9,8,1,0,2,8,4,7,8,3,9,7,8,9,5,5
3,4,9,1,8,7,4,2,1,0,4,3,1,4,6,6,7,4,9,9,6,7,9,5,2,2,8,0,2,9
3,4,8,9,9,1,5,2,7,4,7,1,4,9,8,9,3,3,2,3,3,5,4,8,6,5,8,8,6,4
4,0,6,9,3,2,4,2,9,4,6,8,8,2,6,7,1,7,3,1,6,6,5,2,9,9,4,6,9,7
7,0,9,3,7,6,5,5,7,2,4,2,7,4,6,1,0,9,8,2,5,7,1,4,0,4,3,9,4,3
2,8,3,7,7,3,3,6,9,3,5,5,0,7,5,3,6,2,9,0,8,2,3,0,6,2,4,3,2,6
3,2,0,8,8,8,1,8,4,0,5,2,5,0,2,0,4,1,2,2,1,0,2,8,6,7,2,2,7,0
0,5,9,1,0,3,1,9,3,6,2,1,5,0,6,6,3,8,2,8,0,0,1,9,1,5,5,2,4,8

-- Single-column external table: no field delimiter is declared, and the one
-- STRING column `rec` is meant to receive each input record unsplit so it can
-- be split on commas at query time.
-- NOTE(review): no LOCATION is given, so the data directory is whatever the
-- warehouse default resolves to -- confirm the CSV is placed there.
CREATE EXTERNAL TABLE mycsv (
    rec STRING
)
ROW FORMAT DELIMITED
STORED AS TEXTFILE
TBLPROPERTIES ('serialization.last.column.takes.rest'='true');

-- Profile the raw CSV: for each column position, count its distinct values.
-- split() turns a record into an array of fields; posexplode() emits one row
-- per field with its 0-based position (`pos`) and value (`val`).
select
    pe.pos + 1             as col,                -- 1-based column number
    count(distinct pe.val) as count_distinct_val
from mycsv
lateral view posexplode(split(rec, ',')) pe as pos, val
group by pe.pos
-- Without an explicit sort the row order is not guaranteed; sort by column
-- number so the result reads in file column order.
order by col
;

+------+---------------------+
| col | count_distinct_val |
+------+---------------------+
| 1 | 5 |
| 2 | 6 |
| 3 | 6 |
| 4 | 5 |
| 5 | 7 |
| 6 | 8 |
| 7 | 7 |
| 8 | 7 |
| 9 | 6 |
| 10 | 7 |
| 11 | 6 |
| 12 | 7 |
| 13 | 7 |
| 14 | 6 |
| 15 | 6 |
| 16 | 9 |
| 17 | 7 |
| 18 | 9 |
| 19 | 5 |
| 20 | 6 |
| 21 | 7 |
| 22 | 5 |
| 23 | 8 |
| 24 | 7 |
| 25 | 5 |
| 26 | 6 |
| 27 | 7 |
| 28 | 8 |
| 29 | 8 |
| 30 | 8 |
+------+---------------------+

关于hadoop - 在不指定列名和列类型的情况下创建一个 Hive 表,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/46264326/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com