// PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.api;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Utility methods for querying and summarizing a {@link HaplotypeGraph}.
 *
 * @author Terry Casstevens
 * Created August 30, 2017
 */
public class GraphUtils {
    private GraphUtils() {
        // utility
    }
    /**
     * Returns a sorted set of haplotype ids from the given paths.
     *
     * @param paths paths
     *
     * @return sorted set of haplotype ids
     */
    public static SortedSet<Integer> path(TreeMap<?, HaplotypePath> paths) {
        ImmutableSortedSet.Builder<Integer> result = ImmutableSortedSet.naturalOrder();
        for (HaplotypePath path : paths.values()) {
            for (HaplotypeNode node : path.nodes()) {
                result.add(node.id());
            }
        }
        return result.build();
    }
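    /**
     * Returns a sorted set of HaplotypeNodes from the given paths.
     *
     * @param paths paths
     *
     * @return sorted set of nodes
     */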
    public static SortedSet<HaplotypeNode> pathsToNodes(TreeMap<?, HaplotypePath> paths) {
        ImmutableSortedSet.Builder<HaplotypeNode> result = ImmutableSortedSet.naturalOrder();
        for (HaplotypePath path : paths.values()) {
            for (HaplotypeNode node : path.nodes()) {
                result.add(node);
            }
        }
        return result.build();
    }
    /**
     * Returns a list of HaplotypeNodes corresponding to the given hapids in the given graph.
     *
     * @param graph graph
     * @param hapids haplotype ids
     *
     * @return list of HaplotypeNodes
     */
    public static List<HaplotypeNode> nodes(HaplotypeGraph graph, SortedSet<Integer> hapids) {
        return graph.nodeStream().parallel().filter(node -> hapids.contains(node.id())).collect(Collectors.toList());
    }
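    /**
     * Writes a tab-delimited presence / absence matrix to the given file. Each row is a
     * haplotype node (reference range interval plus haplotype id), and each taxon column
     * holds 1 if that taxon is represented by the node, 0 otherwise.
     *
     * @param graph graph
     * @param filename output filename
     */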
    public static void presenceAbsenceTaxonByNode(HaplotypeGraph graph, String filename) {
        TaxaList taxa = graph.taxaInGraph();
        try (BufferedWriter writer = Utils.getBufferedWriter(filename)) {
            writer.write("interval\thaplotypes_id");
            for (Taxon taxon : taxa) {
                writer.write("\t");
                writer.write(taxon.getName());
            }
            writer.write("\n");
            graph.referenceRanges().forEach(range -> {
                for (HaplotypeNode node : graph.nodes(range)) {
                    try {
                        writer.write(range.intervalString());
                        writer.write("\t");
                        writer.write(String.valueOf(node.id()));
                        for (Taxon taxon : taxa) {
                            writer.write("\t");
                            if (node.taxaList().contains(taxon)) {
                                writer.write("1");
                            } else {
                                writer.write("0");
                            }
                        }
                        writer.write("\n");
                    } catch (Exception ex) {
                        throw new IllegalStateException("GraphUtils: presenceAbsenceTaxonByNode: problem writing: " + filename + "\n" + ex.getMessage());
                    }
                }
            });
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: presenceAbsenceTaxonByNode: problem writing: " + filename + "\n" + e.getMessage());
        }
    }
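    /**
     * Returns the distribution of taxa representation across reference ranges.
     * The value at index i is the number of reference ranges represented by exactly i taxa.
     *
     * @param graph graph
     *
     * @return histogram of reference range counts indexed by number of taxa
     */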
    public static int[] distributionTaxaRepresented(HaplotypeGraph graph) {
        int totalNumTaxa = graph.totalNumberTaxa();
        // array needs totalNumTaxa + 1 slots so a range represented by every taxon can be counted
        return graph.referenceRangeStream().mapToInt(range -> graph.numberTaxa(range))
                .collect(() -> new int[totalNumTaxa + 1], (ints, value) -> ints[value]++, (ints, ints2) -> {
                    for (int i = 0; i < ints.length; i++) {
                        ints[i] += ints2[i];
                    }
                });
    }
    /*
    public static int[] cumulativeDistributionTaxaRepresented(HaplotypeGraph graph) {
        int[] result = new int[graph.totalNumberTaxa() + 1];
        for (ReferenceRange range : graph.referenceRanges()) {
            int numTaxa = graph.numberTaxa(range);
            for (int i = 0; i <= numTaxa; i++) {
                result[i]++;
            }
        }
        return result;
    }
    */
    /**
     * Gets the list of HaplotypeNodes whose taxa list exactly matches the given taxa.
     *
     * @param taxa taxa list
     * @param graph graph
     *
     * @return list of nodes
     */
    public static List<HaplotypeNode> nodesContainingExactly(TaxaList taxa, HaplotypeGraph graph) {
        int numInputTaxa = taxa.numberOfTaxa();
        return graph.referenceRangeStream()
                .flatMap(range -> graph.nodes(range).stream())
                .filter(node -> node.numTaxa() == numInputTaxa)
                .filter(node -> {
                    boolean keep = true;
                    for (Taxon taxon : taxa) {
                        if (!node.taxaList().contains(taxon)) {
                            keep = false;
                            break;
                        }
                    }
                    return keep;
                })
                .collect(Collectors.toList());
    }
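    /**
     * Counts, for each tag, how many of its associated taxa carry each haplotype node,
     * and writes the (tag, hapid, count) triples to a tab-delimited file. The input file
     * is tab-delimited with the taxon name in the first column and the tag in the second;
     * an optional header line starting with "Sample" is skipped.
     *
     * @param tagTaxaFile tab-delimited file of taxon / tag pairs
     * @param graph graph
     * @param outputFile output filename
     */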
    public static void tagNodePairCounts(String tagTaxaFile, HaplotypeGraph graph, String outputFile) {
        // Map of tags to list of taxa names
        Multimap<String, String> tagToTaxa = HashMultimap.create();
        try (BufferedReader reader = Utils.getBufferedReader(tagTaxaFile)) {
            String line = reader.readLine();
            if (line != null && line.startsWith("Sample")) {
                line = reader.readLine();
            }
            while (line != null) {
                String[] tokens = line.split("\t");
                String taxon = tokens[0];
                String tag = tokens[1];
                tagToTaxa.put(tag, taxon);
                line = reader.readLine();
            }
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: tagNodePairCounts: problem reading: " + tagTaxaFile + "\n" + e.getMessage());
        }
        // Map of taxon name to hapids
        Multimap<String, Integer> taxonToNode = graph.referenceRangeStream()
                .flatMap(range -> graph.nodes(range).stream())
                .collect(() -> HashMultimap.create(), (map, node) -> {
                    for (Taxon taxon : node.taxaList()) {
                        map.put(taxon.getName(), node.id());
                    }
                }, (BiConsumer<Multimap<String, Integer>, Multimap<String, Integer>>) (map, map2) -> map.putAll(map2));
        try (BufferedWriter writer = Utils.getBufferedWriter(outputFile)) {
            writer.write("tag\thapid\tcount\n");
            for (String tag : tagToTaxa.keySet()) {
                Multiset<Integer> counts = TreeMultiset.create();
                for (String taxon : tagToTaxa.get(tag)) {
                    for (Integer hapid : taxonToNode.get(taxon)) {
                        counts.add(hapid);
                    }
                }
                for (Integer hapid : counts.elementSet()) {
                    writer.write(tag);
                    writer.write("\t");
                    writer.write(String.valueOf(hapid));
                    writer.write("\t");
                    writer.write(String.valueOf(counts.count(hapid)));
                    writer.write("\n");
                }
            }
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: tagNodePairCounts: problem writing: " + outputFile + "\n" + e.getMessage());
        }
    }
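    /**
     * Writes one line per (haplotype node, taxon) pair to a tab-delimited file with
     * columns interval, haplotypes_id, and sample.
     *
     * @param graph graph
     * @param filename output filename
     */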
    public static void nodeTaxaPairs(HaplotypeGraph graph, String filename) {
        try (BufferedWriter writer = Utils.getBufferedWriter(filename)) {
            writer.write("interval\thaplotypes_id\tsample\n");
            graph.nodeStream().forEach(node -> {
                String interval = node.referenceRange().intervalString();
                String hapID = String.valueOf(node.id());
                for (Taxon taxon : node.taxaList()) {
                    try {
                        writer.write(interval);
                        writer.write("\t");
                        writer.write(hapID);
                        writer.write("\t");
                        writer.write(taxon.getName());
                        writer.write("\n");
                    } catch (Exception ex) {
                        throw new IllegalStateException("GraphUtils: nodeTaxaPairs: problem writing: " + filename + "\n" + ex.getMessage());
                    }
                }
            });
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: nodeTaxaPairs: problem writing: " + filename + "\n" + e.getMessage());
        }
    }
    /**
     * Returns a sorted set of positions that are variant (SNP) positions in the given graph.
     *
     * @param graph graph
     *
     * @return sorted set of positions
     */
    public static SortedSet<Position> snpPositions(HaplotypeGraph graph) {
        return snpPositions(graph, null);
    }
    /**
     * Returns a sorted set of positions that are variant (SNP) positions in the given graph.
     *
     * @param graph graph
     * @param referenceRanges set of reference range ids to include in the result.
     * If null, all ranges are included.
     *
     * @return sorted set of positions
     */
    public static SortedSet<Position> snpPositions(HaplotypeGraph graph, Collection<Integer> referenceRanges) {
        return graph.nodeStream().parallel()
                .filter(node -> referenceRanges == null || referenceRanges.contains(node.referenceRange().id()))
                .map(node -> node.variantInfos())
                .filter(contexts -> contexts.isPresent())
                .flatMap((Function<Optional<List<VariantInfo>>, Stream<VariantInfo>>) contexts -> contexts.get().stream())
                .filter(context -> {
                    // TODO undo indel filtering
                    return context.isVariant() && !context.isIndel();
                })
                .collect(() -> new TreeSet<>(),
                        (positions, context) -> {
                            String reference = context.refAlleleString();
                            //String alternate = context.getAlternateAllele(0).getBaseString();
                            int start = context.start();
                            for (int i = start; i <= context.end(); i++) {
                                GeneralPosition.Builder builder = Position.builder(context.chromosome(), i)
                                        .allele(WHICH_ALLELE.Reference, NucleotideAlignmentConstants.getNucleotideAlleleByte(reference.charAt(i - start)));
                                //.allele(WHICH_ALLELE.Alternate, NucleotideAlignmentConstants.getNucleotideAlleleByte(alternate.charAt(i - start)));
                                positions.add(builder.build());
                            }
                        },
                        (BiConsumer<SortedSet<Position>, SortedSet<Position>>) (positions, positions2) -> positions.addAll(positions2));
    }
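
    // Usage sketch (illustrative only; assumes a HaplotypeGraph has already been
    // built elsewhere, e.g. by the PHG pipeline, and that "graph" and the file
    // names below are placeholders):
    //
    //   SortedSet<Position> snps = GraphUtils.snpPositions(graph);
    //   GraphUtils.presenceAbsenceTaxonByNode(graph, "presence_absence.txt");
    //   int[] dist = GraphUtils.distributionTaxaRepresented(graph);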
}