The Google Cloud Dataflow Java SDK provides a simple, Java-based
interface for processing data of virtually any size using Google Cloud
resources. This artifact includes the entire Dataflow Java SDK.
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import static com.google.api.services.datastore.DatastoreV1.PropertyFilter.Operator.EQUAL;
import static com.google.api.services.datastore.DatastoreV1.PropertyOrder.Direction.DESCENDING;
import static com.google.api.services.datastore.DatastoreV1.QueryResultBatch.MoreResultsType.NOT_FINISHED;
import static com.google.api.services.datastore.client.DatastoreHelper.getPropertyMap;
import static com.google.api.services.datastore.client.DatastoreHelper.makeFilter;
import static com.google.api.services.datastore.client.DatastoreHelper.makeOrder;
import static com.google.api.services.datastore.client.DatastoreHelper.makeValue;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Verify.verify;
import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.Sleeper;
import com.google.api.services.datastore.DatastoreV1.CommitRequest;
import com.google.api.services.datastore.DatastoreV1.Entity;
import com.google.api.services.datastore.DatastoreV1.EntityResult;
import com.google.api.services.datastore.DatastoreV1.Key;
import com.google.api.services.datastore.DatastoreV1.Key.PathElement;
import com.google.api.services.datastore.DatastoreV1.PartitionId;
import com.google.api.services.datastore.DatastoreV1.Query;
import com.google.api.services.datastore.DatastoreV1.QueryResultBatch;
import com.google.api.services.datastore.DatastoreV1.RunQueryRequest;
import com.google.api.services.datastore.DatastoreV1.RunQueryResponse;
import com.google.api.services.datastore.client.Datastore;
import com.google.api.services.datastore.client.DatastoreException;
import com.google.api.services.datastore.client.DatastoreFactory;
import com.google.api.services.datastore.client.DatastoreHelper;
import com.google.api.services.datastore.client.DatastoreOptions;
import com.google.api.services.datastore.client.QuerySplitter;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.EntityCoder;
import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
import com.google.cloud.dataflow.sdk.io.Sink.WriteOperation;
import com.google.cloud.dataflow.sdk.io.Sink.Writer;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineWorkerPoolOptions;
import com.google.cloud.dataflow.sdk.options.GcpOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.util.FluentBackoff;
import com.google.cloud.dataflow.sdk.util.RetryHttpRequestInitializer;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.common.base.MoreObjects;
import com.google.common.collect.ImmutableList;
import com.google.common.primitives.Ints;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import javax.annotation.Nullable;
/**
* {@link DatastoreIO} provides an API to Read and Write {@link PCollection PCollections} of
* Google Cloud Datastore
* {@link Entity} objects.
*
* <p>Google Cloud Datastore is a fully managed NoSQL data storage service. An {@code Entity} is
* an object in Datastore, analogous to a row in a traditional database table.
*
* <p>This API currently requires an authentication workaround. To use {@link DatastoreIO}, users
* must use the {@code gcloud} command line tool to get credentials for Datastore:
*
* <pre>
* $ gcloud auth login
* </pre>
*
* <p>To read a {@link PCollection} from a query to Datastore, use {@link DatastoreIO#source} and
* its methods {@link DatastoreIO.Source#withDataset} and {@link DatastoreIO.Source#withQuery} to
* specify the dataset to query and the query to read from. You can optionally provide a namespace
* to query within using {@link DatastoreIO.Source#withNamespace} or a Datastore host using
* {@link DatastoreIO.Source#withHost}.
*
* <pre> {@code
* // Read a query from Datastore using the default namespace and host
* PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
* Query query = ...;
* String dataset = "...";
*
* Pipeline p = Pipeline.create(options);
* PCollection<Entity> entities = p.apply(DatastoreIO.readFrom(dataset, query));
* p.run();
* } </pre>
*
* <p><b>Note:</b> Normally, a Cloud Dataflow job will read from Cloud Datastore in parallel across
* many workers. However, when the {@link Query} is configured with a limit using
* {@link com.google.api.services.datastore.DatastoreV1.Query.Builder#setLimit(int)}, then
* all returned results will be read by a single Dataflow worker in order to ensure correct data.
*
* <p>To write a {@link PCollection} to a Datastore, use {@link DatastoreIO#writeTo},
* specifying the datastore to write to:
*
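* <pre> {@code
* // A minimal sketch: `dataset` is the dataset ID string and `p` the pipeline from the
* // read example above.
* PCollection<Entity> entities = ...;
* entities.apply(DatastoreIO.writeTo(dataset));
* p.run();
* } </pre>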
*
* <p>To optionally change the host that is used to write to the Datastore, use {@link
* DatastoreIO#sink} to build a {@link DatastoreIO.Sink} and write to it using the {@link Write}
* transform:
*
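* <pre> {@code
* // A sketch along the same lines; `dataset` and `host` are assumed to be defined elsewhere.
* PCollection<Entity> entities = ...;
* entities.apply(Write.to(DatastoreIO.sink().withDataset(dataset).withHost(host)));
* p.run();
* } </pre>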
*
* <p>{@link Entity Entities} in the {@code PCollection} to be written must have complete
* {@link Key Keys}. Complete {@code Keys} specify the {@code name} and {@code id} of the
* {@code Entity}, where incomplete {@code Keys} do not. A {@code namespace} other than the
* project default may be written to by specifying it in the {@code Entity} {@code Keys}.
*
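* <p>For example, an {@code Entity} with a complete {@code Key} might be constructed as follows
* (a sketch; {@code "MyKind"} and {@code "my-name"} are placeholder kind and name values):
*
* <pre> {@code
* Entity entity = Entity.newBuilder()
*     .setKey(DatastoreHelper.makeKey("MyKind", "my-name"))
*     .build();
* } </pre>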
*
* <p>{@code Entities} will be committed as upsert (update or insert) mutations. Please read
* <a href="https://cloud.google.com/datastore/docs/concepts/entities">Entities, Properties, and
* Keys</a> for more information about {@code Entity} keys.
*
* <h3>Permissions</h3>
*
* <p>Permission requirements depend on the {@code PipelineRunner} that is used to execute the
* Dataflow job. Please refer to the documentation of corresponding {@code PipelineRunner}s for
* more details.
*
* <p>Please see Cloud Datastore Sign Up for security and permission related information
* specific to Datastore.
*
* @see com.google.cloud.dataflow.sdk.runners.PipelineRunner
*
* @deprecated replaced by {@link com.google.cloud.dataflow.sdk.io.datastore.DatastoreIO}
*/
@Deprecated
@Experimental(Experimental.Kind.SOURCE_SINK)
public class DatastoreIO {
public static final String DEFAULT_HOST = "https://www.googleapis.com";
/**
* Datastore has a limit of 500 mutations per batch operation, so we flush
* changes to Datastore every 500 entities.
*/
public static final int DATASTORE_BATCH_UPDATE_LIMIT = 500;
/**
* Returns an empty {@link DatastoreIO.Source} builder with the default {@code host}.
* Configure the {@code dataset}, {@code query}, and {@code namespace} using
* {@link DatastoreIO.Source#withDataset}, {@link DatastoreIO.Source#withQuery},
* and {@link DatastoreIO.Source#withNamespace}.
*
* @deprecated the name and return type do not match. Use {@link #source()}.
*/
@Deprecated
public static Source read() {
return source();
}
/**
* Returns an empty {@link DatastoreIO.Source} builder with the default {@code host}.
* Configure the {@code dataset}, {@code query}, and {@code namespace} using
* {@link DatastoreIO.Source#withDataset}, {@link DatastoreIO.Source#withQuery},
* and {@link DatastoreIO.Source#withNamespace}.
*
* <p>The resulting {@link Source} object can be passed to {@link Read} to create a
* {@code PTransform} that will read from Datastore.
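*
* <p>For example (a sketch; {@code dataset}, {@code query}, and the pipeline {@code p} are
* assumed to be defined):
*
* <pre> {@code
* PCollection<Entity> entities = p.apply(
*     Read.from(DatastoreIO.source().withDataset(dataset).withQuery(query)));
* } </pre>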
*/
public static Source source() {
return new Source(DEFAULT_HOST, null, null, null);
}
/**
* Returns a {@code PTransform} that reads Datastore entities from the query
* against the given dataset.
*/
public static Read.Bounded<Entity> readFrom(String datasetId, Query query) {
return Read.from(new Source(DEFAULT_HOST, datasetId, query, null));
}
/**
* Returns a {@code PTransform} that reads Datastore entities from the query
* against the given dataset and host.
*
* @deprecated prefer {@link #source()} with {@link Source#withHost}, {@link Source#withDataset},
* and {@link Source#withQuery}.
*/
@Deprecated
public static Read.Bounded<Entity> readFrom(String host, String datasetId, Query query) {
return Read.from(new Source(host, datasetId, query, null));
}
/**
* A {@link Source} that reads the result rows of a Datastore query as {@code Entity} objects.
*/
public static class Source extends BoundedSource<Entity> {
public String getHost() {
return host;
}
public String getDataset() {
return datasetId;
}
public Query getQuery() {
return query;
}
@Nullable
public String getNamespace() {
return namespace;
}
public Source withDataset(String datasetId) {
checkNotNull(datasetId, "datasetId");
return new Source(host, datasetId, query, namespace);
}
/**
* Returns a new {@link Source} that reads the results of the specified query.
*
* <p>Does not modify this object.
*
* <p><b>Note:</b> Normally, a Cloud Dataflow job will read from Cloud Datastore in parallel
* across many workers. However, when the {@link Query} is configured with a limit using
* {@link com.google.api.services.datastore.DatastoreV1.Query.Builder#setLimit(int)}, then all
* returned results will be read by a single Dataflow worker in order to ensure correct data.
*/
public Source withQuery(Query query) {
checkNotNull(query, "query");
checkArgument(!query.hasLimit() || query.getLimit() > 0,
"Invalid query limit %s: must be positive", query.getLimit());
return new Source(host, datasetId, query, namespace);
}
public Source withHost(String host) {
checkNotNull(host, "host");
return new Source(host, datasetId, query, namespace);
}
public Source withNamespace(@Nullable String namespace) {
return new Source(host, datasetId, query, namespace);
}
@Override
public Coder<Entity> getDefaultOutputCoder() {
return EntityCoder.of();
}
@Override
public boolean producesSortedKeys(PipelineOptions options) {
// TODO: Perhaps this can be implemented by inspecting the query.
return false;
}
@Override
public List