All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.genomics.dataflow.functions.OutputPCoAFile Maven / Gradle / Ivy

/*
 * Copyright (C) 2014 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.functions;

import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.common.collect.BiMap;

import java.util.ArrayList;
import java.util.List;

/**
 * Given a set of similar pair counts, this function aggregates the counts,
 * runs Principal Coordinate Analysis, and writes the result to a tab-delimited GCS file which
 * can be imported into Google Spreadsheets and rendered with a bubble graph.
 *
 * The input data must be for a similarity matrix which will be symmetric. This is not
 * the same as Principal Component Analysis.
*/
public class OutputPCoAFile extends PTransform, Long>>, PDone> {

  private static final Combine.CombineFn, Long>,
      List, Long>>, Iterable, Long>>> TO_LIST =
      toList();

  private static  Combine.CombineFn, Iterable> toList() {
    return new Combine.CombineFn, Iterable>() {

          @Override public List addInput(List accumulator, X input) {
            accumulator.add(input);
            return accumulator;
          }

          @Override public List createAccumulator() {
            return new ArrayList<>();
          }

          @Override public Iterable extractOutput(List accumulator) {
            return accumulator;
          }

          @Override public List mergeAccumulators(
              Iterable> accumulators) {
            List merged = new ArrayList<>();
            for (List accumulator : accumulators) {
              merged.addAll(accumulator);
            }
            return merged;
          }
        };
  }

  private BiMap dataIndices;
  private final String outputFile;

  public OutputPCoAFile(BiMap dataIndices, String outputFile) {
    this.dataIndices = dataIndices;
    this.outputFile = outputFile;
  }

  @Override
  public PDone apply(PCollection, Long>> similarPairs) {
    return similarPairs
        .apply(Sum.>longsPerKey())
        .apply(Combine.globally(TO_LIST))
        .apply(ParDo.named("PCoAAnalysis").of(new PCoAnalysis(dataIndices)))
        .apply(ParDo.named("FormatGraphData")
            .of(new DoFn, String>() {
              @Override
              public void processElement(ProcessContext c) throws Exception {
                Iterable graphResults = c.element();
                for (PCoAnalysis.GraphResult result : graphResults) {
                  c.output(result.toString());
                }
              }
            }))
        .apply(TextIO.Write.named("WriteCounts").to(outputFile));
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy