// com.google.cloud.genomics.dataflow.functions.OutputPCoAFile Maven / Gradle / Ivy
/*
* Copyright (C) 2014 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.genomics.dataflow.functions;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.common.collect.BiMap;
import java.util.ArrayList;
import java.util.List;
/**
* Aggregates similar-pair counts, runs Principal Coordinate Analysis (PCoA) on them, and
* writes the result to a tab-delimited GCS file which can be imported into Google
* Spreadsheets and rendered with a bubble graph.
*
* <p>The input data must be for a similarity matrix, which will be symmetric. Note that
* Principal Coordinate Analysis is not the same as Principal Component Analysis.
*
* <p>Pipeline stages, in the order applied by {@code apply}:
* <ol>
* <li>sum the {@code Long} counts per key ({@code Sum.longsPerKey}),
* <li>gather every summed entry into a single list ({@code Combine.globally(TO_LIST)}),
* <li>run {@code PCoAnalysis}, constructed with {@code dataIndices}, over that list,
* <li>emit {@code toString()} of each {@code PCoAnalysis.GraphResult} as one output string,
* <li>write those strings to {@code outputFile} with {@code TextIO.Write}.
* </ol>
*
* <p>NOTE(review): the generic type arguments in this listing appear to have been stripped
* (for example {@code PTransform, Long>>, PDone>} and {@code Sum.>longsPerKey()}). The code
* will not compile as shown; restore the type arguments from the upstream source. TODO confirm.
*/
public class OutputPCoAFile extends PTransform, Long>>, PDone> {
// Shared combiner instance used by apply() to collect all summed pair counts into one list.
private static final Combine.CombineFn, Long>,
List, Long>>, Iterable, Long>>> TO_LIST =
toList();
/**
* Builds a combine function that gathers every input element into a list.
*
* <p>The accumulator is an {@code ArrayList}; the extracted output is that same list exposed
* as an {@code Iterable}. Because apply() uses this with {@code Combine.globally}, all
* elements end up in one in-memory list.
*
* <p>NOTE(review): the body refers to an element type {@code X}, so this was presumably
* declared as a generic method with a type parameter {@code X} - TODO confirm.
*
* @return a new combine function whose accumulator and output hold all inputs seen
*/
private static Combine.CombineFn, Iterable> toList() {
return new Combine.CombineFn, Iterable>() {
// Appends the element to the accumulator in place and returns the same list instance.
@Override public List addInput(List accumulator, X input) {
accumulator.add(input);
return accumulator;
}
// Each accumulator starts as a fresh, empty ArrayList.
@Override public List createAccumulator() {
return new ArrayList<>();
}
// No conversion is done: the accumulator list itself is returned as the result.
@Override public Iterable extractOutput(List accumulator) {
return accumulator;
}
// Concatenates all partial lists, in iteration order, into a newly allocated list;
// the input accumulators are read but not modified.
@Override public List mergeAccumulators(
Iterable> accumulators) {
List merged = new ArrayList<>();
for (List accumulator : accumulators) {
merged.addAll(accumulator);
}
return merged;
}
};
}
// Map handed unchanged to the PCoAnalysis constructor in apply().
// NOTE(review): assigned only in the constructor in the visible code, yet not declared
// final like outputFile - consider making it final.
private BiMap dataIndices;
// Destination passed to TextIO.Write.to(...) in apply().
private final String outputFile;
/**
* Creates the transform.
*
* @param dataIndices map forwarded to {@code PCoAnalysis}; stored by reference, not copied
* @param outputFile destination given to {@code TextIO.Write.to(...)}
*/
public OutputPCoAFile(BiMap dataIndices, String outputFile) {
this.dataIndices = dataIndices;
this.outputFile = outputFile;
}
/**
* Applies the aggregate, analyze, format and write stages to the input collection.
*
* @param similarPairs collection of keyed {@code Long} counts to be summed per key
* @return the {@code PDone} produced by the final {@code TextIO.Write} step
*/
@Override
public PDone apply(PCollection, Long>> similarPairs) {
return similarPairs
// Collapse repeated keys by summing their counts.
.apply(Sum.>longsPerKey())
// Bring all summed entries together into a single list element.
.apply(Combine.globally(TO_LIST))
.apply(ParDo.named("PCoAAnalysis").of(new PCoAnalysis(dataIndices)))
.apply(ParDo.named("FormatGraphData")
.of(new DoFn, String>() {
// Flattens the iterable of graph results: one output string per result, using
// GraphResult.toString() as the text representation.
@Override
public void processElement(ProcessContext c) throws Exception {
Iterable graphResults = c.element();
for (PCoAnalysis.GraphResult result : graphResults) {
c.output(result.toString());
}
}
}))
.apply(TextIO.Write.named("WriteCounts").to(outputFile));
}
}