gpt4 book ai didi

java - Dataflow 如何与 BIgQuery 数据集配合使用

转载 作者:行者123 更新时间:2023-11-30 06:06:54 25 4
gpt4 key购买 nike

我没有找到如何从指定的数据集中获取表。我想使用数据流将表从美国数据集迁移到欧盟数据集位置。我想获取美国数据集并行过程中的所有表,并将这些表写入欧盟数据集。

Beam 2.4 使用 com.google.api.services.bigquery v2-rev374-1.22.0。这也是您应该与 Beam 2.4 一起使用的库。

代码使用 DirectRunner 成功运行,但如果我使用 DataflowRunner 运行则不会运行并引发错误

un 29, 2018 1:52:48 PM com.google.api.client.http.HttpRequest execute
ADVERTENCIA: exception thrown while executing request
java.net.SocketException: Unexpected end of file from server
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:851)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1587)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1492)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getResponseCode(HttpsURLConnectionImpl.java:347)
at com.google.api.client.http.javanet.NetHttpResponse.<init>(NetHttpResponse.java:37)
at com.google.api.client.http.javanet.NetHttpRequest.execute(NetHttpRequest.java:94)
at com.google.api.client.http.HttpRequest.execute(HttpRequest.java:972)
at com.google.api.client.googleapis.media.MediaHttpUploader.executeCurrentRequestWithoutGZip(MediaHttpUploader.java:545)
at com.google.api.client.googleapis.media.MediaHttpUploader.executeCurrentRequest(MediaHttpUploader.java:562)
at com.google.api.client.googleapis.media.MediaHttpUploader.executeUploadInitiation(MediaHttpUploader.java:519)
at com.google.api.client.googleapis.media.MediaHttpUploader.resumableUpload(MediaHttpUploader.java:384)
at com.google.api.client.googleapis.media.MediaHttpUploader.upload(MediaHttpUploader.java:336)
at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:427)
at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:352)
at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:469)
at com.google.cloud.hadoop.util.AbstractGoogleAsyncWriteChannel$UploadOperation.call(AbstractGoogleAsyncWriteChannel.java:357)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

jun 29, 2018 1:52:55 PM com.google.api.client.http.HttpRequest execute
ADVERTENCIA: exception thrown while executing request
java.net.SocketException: Unexpected end of file from server
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:851)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1587)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1492)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getResponseCode(HttpsURLConnectionImpl.java:347)
at com.google.api.client.http.javanet.NetHttpResponse.<init>(NetHttpResponse.java:37)
at com.google.api.client.http.javanet.NetHttpRequest.execute(NetHttpRequest.java:94)
at com.google.api.client.http.HttpRequest.execute(HttpRequest.java:972)
at com.google.api.client.googleapis.media.MediaHttpUploader.executeCurrentRequestWithoutGZip(MediaHttpUploader.java:545)
at com.google.api.client.googleapis.media.MediaHttpUploader.executeCurrentRequest(MediaHttpUploader.java:562)
at com.google.api.client.googleapis.media.MediaHttpUploader.executeUploadInitiation(MediaHttpUploader.java:519)
at com.google.api.client.googleapis.media.MediaHttpUploader.resumableUpload(MediaHttpUploader.java:384)
at com.google.api.client.googleapis.media.MediaHttpUploader.upload(MediaHttpUploader.java:336)
at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:427)
at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:352)
at com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:469)
at com.google.cloud.hadoop.util.AbstractGoogleAsyncWriteChannel$UploadOperation.call(AbstractGoogleAsyncWriteChannel.java:357)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

the code example

package emp.customerjourney.etls;

import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.Bigquery.*;
import com.google.api.services.bigquery.BigqueryScopes;
import com.google.api.services.bigquery.model.*;


import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;


import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.Validation;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

public class Migracion {

public static interface MyOptions extends DataflowPipelineOptions {
@Description("BigQuery table to write to, specified as "+ "<project_id>:<dataset_id>.<table_id>. The dataset must already exist.")
@Default.String("customerjourney:prueba.weather_stations")
@Validation.Required
String getOutput();
void setOutput(String s);

@Description("Table to read from, specified as "+ "<project_id>:<dataset_id>.<table_id>")
@Default.String("customerjourney:118678548.gsod3")
String getInput();
void setInput(String value);


}


public static Bigquery createAuthorizedClient() throws IOException {
// Create the credential
HttpTransport transport = new NetHttpTransport();
JsonFactory jsonFactory = new JacksonFactory();
GoogleCredential credential = GoogleCredential.getApplicationDefault(transport, jsonFactory);

if (credential.createScopedRequired()) {
credential = credential.createScoped(BigqueryScopes.all());
}

return new Bigquery.Builder(transport, jsonFactory, credential)
.setApplicationName("Bigquery Samples")
.build();
}


public static final void main(String args[]) throws Exception {


String projectId = "customerjourney";
String datasetName = "dsorigen";
// Create a new Bigquery client authorized via Application Default Credentials.
Bigquery bigquery = createAuthorizedClient();
Bigquery.Tables.List lista=bigquery.tables().list(projectId,datasetName);
TableList rp= lista.execute();
List<TableList.Tables> tblista =rp.getTables();
String entrada=tblista.get(3).getId();

MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);

options.setTempLocation("gs://pruebasg/teststaging");
options.setRegion("europe-west1");
options.setStagingLocation("gs://pruebasg/temp_dataflow_tasks");
Pipeline p = Pipeline.create(options);

// Build the table schema for the output table.
List<TableFieldSchema> fields = new ArrayList<>();
fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
fields.add(new TableFieldSchema().setName("tornado_count").setType("INTEGER"));
TableSchema schema = new TableSchema().setFields(fields);

// p.apply(BigQueryIO.readTableRows().from(options.getInput()))
p.apply(BigQueryIO.readTableRows().from(entrada)) //get dataset name form api Bigquery V2
.apply(new BigQueryTornadoes.CountTornadoes())
.apply(BigQueryIO.writeTableRows()
.to(options.getOutput())
.withSchema(schema)
.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));



p.run().waitUntilFinish();

options.getExecutorService().shutdown();
try {
options.getExecutorService().awaitTermination(3, TimeUnit.MINUTES);
} catch (InterruptedException e) {
System.out.println("Thread was interrupted waiting for execution service to shutdown.");
}
System.out.println("termino");

}

}

<dependencies>
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-direct-java</artifactId>
</dependency>

<dependency>
<groupId>com.google.cloud.dataflow</groupId>
<artifactId>google-cloud-dataflow-java-sdk-all</artifactId>
<version>[2.4.0, 2.99)</version>
</dependency>

<!-- slf4j API frontend binding with JUL backend -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.14</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-jdk14</artifactId>
<version>1.7.14</version>
</dependency>

<dependency>
<groupId>com.google.apis</groupId>
<artifactId>google-api-services-bigquery</artifactId>
<version>${bigquery.version}</version>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava-jdk5</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>22.0</version>
</dependency>

<dependency>
<groupId>com.google.oauth-client</groupId>
<artifactId>google-oauth-client</artifactId>
<version>1.21.0</version>
</dependency>
<dependency>
<groupId>com.google.http-client</groupId>
<artifactId>google-http-client-jackson2</artifactId>
<version>1.21.0</version>
</dependency>
<dependency>
<groupId>com.google.oauth-client</groupId>
<artifactId>google-oauth-client-jetty</artifactId>
<version>1.21.0</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.7</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.truth</groupId>
<artifactId>truth</artifactId>
<version>0.29</version>
<scope>test</scope>
</dependency>


</dependencies>

数据流需要 GOOGLE_APPLICATION_CREDENTIALS json 文件才能执行此代码?

I don't found the method list tables of the exist Dataset with BigQuery IO. Reference post

我可以请你帮我吗? –

最佳答案

这是一个连接错误。我建议首先验证您是否正确设置了凭据,如其他答案中所述,当仍然收到此错误时,请执行以下操作:

首先,此错误表明 TCP 套接字在服务器能够发送响应之前已关闭。一些可能的原因是:

  • 网络连接丢失
  • 服务器决定关闭连接
  • 客户端和服务器之间的某些事情终止了请求

考虑使用 netstat/traceroute/ping 等工具来查看是否可以找到案例 1 和 3 的路径问题。

如果服务器是关闭连接的服务器,那么您的请求可能不正确,或者它正在限制您以避免拥塞。在这些情况下,快速重试没有帮助(在某些情况下,如果服务器同时出现问题,那么重试会有所帮助)。您可以尝试指数退避和重试策略,如果不起作用,那么您的请求可能不是正确的。

关于java - Dataflow 如何与 BIgQuery 数据集配合使用,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/51101842/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com