
hadoop - Alter an (EMPTY) table to add a partition COLUMN


HDP-2.5.0.0 with Ambari 2.4.0.1.

Several SQL Server and Oracle database schemas need to be imported into HDFS/Hive.

The current approach works well:

  1. Sqoop imports from the RDBMS into HDFS in avro format
  2. A Hive external table is created on top of the avro files, i.e. dataaggregate_avro_compressed
  3. The final table is created based on the table from step 2. This step needs to be automated
  4. The data from the step-2 table is inserted into the final table (a sketch of this load follows the list)
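
Since the final table is partitioned, step 4 amounts to a dynamic-partition insert. The snippet below is only a minimal sketch of that load through the HiveServer2 JDBC driver, not code from the post: the class name, host, port and credentials are placeholders, and it assumes datedimensionid is the last column of dataaggregate_avro_compressed; with a different column order the SELECT list has to be written out explicitly with the partition column last.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class LoadFinalTable {
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection con = DriverManager.getConnection(
                "jdbc:hive2://host:10000/default", "hive", "");
             Statement stmt = con.createStatement()) {
            // dynamic partitioning routes each row into its datedimensionid partition
            stmt.execute("SET hive.exec.dynamic.partition=true");
            stmt.execute("SET hive.exec.dynamic.partition.mode=nonstrict");
            stmt.execute("INSERT INTO TABLE dataaggregate_orc_empty PARTITION (datedimensionid) "
                    + "SELECT * FROM dataaggregate_avro_compressed");
        }
    }
}

The two SET statements can be dropped if dynamic partitioning is already enabled cluster-wide.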

Now, for step 3: the table must be ORC + COMPRESSED + PARTITIONED, and probably MANAGED. Manually, the following can be done:

CREATE TABLE `dataaggregate_orc_empty`( ......)PARTITIONED BY (`datedimensionid` bigint)  ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' TBLPROPERTIES ('orc.compress'='ZLIB');

Automating this creation is the challenge, and I am struggling with the following approaches:

  • CTAS empty table:

    CREATE TABLE dataaggregate_orc_empty LIKE dataaggregate_avro_compressed ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' TBLPROPERTIES ('orc.compress'='ZLIB');

Now, this table contains the partition column datedimensionid, which needs to be removed from the table, but 'alter table drop column' is not supported.

  • CTAS using a REGEX column spec:

    set hive.support.quoted.identifiers=none; CREATE TABLE dataaggregate_orc_empty AS SELECT `(datedimensionid)?+.+` FROM dataaggregate_avro_compressed LIMIT 0;

This creates the table without the partition column datedimensionid, but how do I now alter this empty table to include the partition column? That is where the first approach got stuck. The documentation talks about adding partitions with a partition spec, but at this stage I don't have one; I just want this table to end up like the manually created one shown at the beginning of the post.

How should I proceed?

Best Answer

Here is one way of connecting with HiveMetaStoreClient; you can use its alter partition methods.

In this class, all the other information, such as partitions, can be extracted along with the columns. Please see the sample client and its methods.

import org.apache.hadoop.hive.conf.HiveConf;

// test program
public class Test {
    public static void main(String[] args) {

        HiveConf hiveConf = new HiveConf();
        hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTCONNECTIONRETRIES, 3);
        hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, "thrift://host:port");

        HiveMetaStoreConnector hiveMetaStoreConnector = new HiveMetaStoreConnector(hiveConf);
        if (hiveMetaStoreConnector != null) {
            // getAllPartitionInfo expects a database name and prints the partitions of all its tables
            System.out.print(hiveMetaStoreConnector.getAllPartitionInfo("databasename"));
        }
    }
}


// define a class like this

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.thrift.TException;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Thin wrapper around HiveMetaStoreClient for reading table, column and partition metadata.
public class HiveMetaStoreConnector {
    private HiveConf hiveConf;
    HiveMetaStoreClient hiveMetaStoreClient;

    public HiveMetaStoreConnector(String msAddr, String msPort) {
        try {
            hiveConf = new HiveConf();
            hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, msAddr + ":" + msPort);
            hiveMetaStoreClient = new HiveMetaStoreClient(hiveConf);
        } catch (MetaException e) {
            e.printStackTrace();
            System.err.println("Constructor error");
            System.err.println(e.toString());
            System.exit(-100);
        }
    }

    public HiveMetaStoreConnector(HiveConf hiveConf) {
        try {
            this.hiveConf = hiveConf;
            hiveMetaStoreClient = new HiveMetaStoreClient(hiveConf);
        } catch (MetaException e) {
            e.printStackTrace();
            System.err.println("Constructor error");
            System.err.println(e.toString());
            System.exit(-100);
        }
    }

    // Lists the partitions of every table in the given database, one line per partition.
    public String getAllPartitionInfo(String dbName) {
        List<String> res = Lists.newArrayList();
        try {
            List<String> tableList = hiveMetaStoreClient.getAllTables(dbName);
            for (String tableName : tableList) {
                res.addAll(getTablePartitionInformation(dbName, tableName));
            }
        } catch (MetaException e) {
            e.printStackTrace();
            System.out.println("getAllTableStatistic error");
            System.out.println(e.toString());
            System.exit(-100);
        }

        return Joiner.on("\n").join(res);
    }

    // Returns "table \t partition values \t creation time" for each partition of one table.
    public List<String> getTablePartitionInformation(String dbName, String tableName) {
        List<String> partitionsInfo = Lists.newArrayList();
        try {
            List<String> partitionNames = hiveMetaStoreClient.listPartitionNames(dbName, tableName, (short) 10000);
            List<Partition> partitions = hiveMetaStoreClient.listPartitions(dbName, tableName, (short) 10000);
            for (Partition partition : partitions) {
                StringBuffer sb = new StringBuffer();
                sb.append(tableName);
                sb.append("\t");
                List<String> partitionValues = partition.getValues();
                if (partitionValues.size() < 4) {
                    int size = partitionValues.size();
                    for (int j = 0; j < 4 - size; j++) {
                        partitionValues.add("null");
                    }
                }
                sb.append(Joiner.on("\t").join(partitionValues));
                sb.append("\t");
                DateTime createDate = new DateTime((long) partition.getCreateTime() * 1000);
                sb.append(createDate.toString("yyyy-MM-dd HH:mm:ss"));
                partitionsInfo.add(sb.toString());
            }

        } catch (TException e) {
            e.printStackTrace();
            return Arrays.asList(new String[]{"error for request on" + tableName});
        }

        return partitionsInfo;
    }

    // Lists the columns of every table in the given database.
    public String getAllTableStatistic(String dbName) {
        List<String> res = Lists.newArrayList();
        try {
            List<String> tableList = hiveMetaStoreClient.getAllTables(dbName);
            for (String tableName : tableList) {
                res.addAll(getTableColumnsInformation(dbName, tableName));
            }
        } catch (MetaException e) {
            e.printStackTrace();
            System.out.println("getAllTableStatistic error");
            System.out.println(e.toString());
            System.exit(-100);
        }

        return Joiner.on("\n").join(res);
    }

    // Returns "table \t position \t name \t type \t comment" for each column of one table.
    public List<String> getTableColumnsInformation(String dbName, String tableName) {
        try {
            List<FieldSchema> fields = hiveMetaStoreClient.getFields(dbName, tableName);
            List<String> infs = Lists.newArrayList();
            int cnt = 0;
            for (FieldSchema fs : fields) {
                StringBuffer sb = new StringBuffer();
                sb.append(tableName);
                sb.append("\t");
                sb.append(cnt);
                sb.append("\t");
                cnt++;
                sb.append(fs.getName());
                sb.append("\t");
                sb.append(fs.getType());
                sb.append("\t");
                sb.append(fs.getComment());
                infs.add(sb.toString());
            }

            return infs;

        } catch (TException e) {
            e.printStackTrace();
            System.out.println("getTableColumnsInformation error");
            System.out.println(e.toString());
            System.exit(-100);
            return null;
        }
    }
}
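
The sample above only reads partition information. The piece that actually addresses the question, turning a column of the empty ORC table into a partition column, could be attempted with the same client by rewriting the table definition and pushing it back through alter_table. The following is only a sketch under assumptions, not code from the answer: the class and variable names are made up, the table must still contain no data, and whether the metastore accepts a change of partition keys via alter_table depends on the Hive version, so it should be tried on a throwaway table first.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;

// hypothetical sketch: move an existing column of an EMPTY table into its partition keys
public class AddPartitionColumn {
    public static void main(String[] args) throws Exception {
        HiveConf hiveConf = new HiveConf();
        hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, "thrift://host:port");
        HiveMetaStoreClient client = new HiveMetaStoreClient(hiveConf);

        String db = "default";                  // assumed database
        String tbl = "dataaggregate_orc_empty"; // the empty ORC table from the question
        String partCol = "datedimensionid";

        Table table = client.getTable(db, tbl);

        // remove the column from the ordinary column list (it is present there if the
        // table was created with CREATE TABLE ... LIKE dataaggregate_avro_compressed)
        FieldSchema moved = null;
        for (Iterator<FieldSchema> it = table.getSd().getCols().iterator(); it.hasNext(); ) {
            FieldSchema fs = it.next();
            if (fs.getName().equalsIgnoreCase(partCol)) {
                moved = fs;
                it.remove();
            }
        }

        // declare it as a partition key and write the new definition back to the metastore
        List<FieldSchema> partKeys = table.getPartitionKeys();
        if (partKeys == null) {
            partKeys = new ArrayList<FieldSchema>();
        }
        partKeys.add(moved != null ? moved : new FieldSchema(partCol, "bigint", null));
        table.setPartitionKeys(partKeys);
        client.alter_table(db, tbl, table);

        client.close();
    }
}

If the metastore rejects the partition-key change, the safer route is to generate the full CREATE TABLE statement shown at the top of the post from the column list returned by getTableColumnsInformation and run it as DDL.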

Regarding "hadoop - Alter an (EMPTY) table to add a partition COLUMN", a similar question was found on Stack Overflow: https://stackoverflow.com/questions/40085211/
