// PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.api;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Utility methods for querying and summarizing a {@link HaplotypeGraph}.
 *
 * @author Terry Casstevens
 * Created August 30, 2017
 */
public class GraphUtils {
    private GraphUtils() {
        // utility
    }
    /**
     * Returns a sorted set of haplotype ids from the given paths.
     *
     * @param paths paths
     *
     * @return sorted set of haplotype ids
     */
    public static SortedSet<Integer> path(TreeMap<?, HaplotypePath> paths) {
        ImmutableSortedSet.Builder<Integer> result = ImmutableSortedSet.naturalOrder();
        for (HaplotypePath path : paths.values()) {
            for (HaplotypeNode node : path.nodes()) {
                result.add(node.id());
            }
        }
        return result.build();
    }
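    /**
     * Returns a sorted set of HaplotypeNodes from the given paths.
     *
     * @param paths paths
     *
     * @return sorted set of nodes
     */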
    public static SortedSet<HaplotypeNode> pathsToNodes(TreeMap<?, HaplotypePath> paths) {
        ImmutableSortedSet.Builder<HaplotypeNode> result = ImmutableSortedSet.naturalOrder();
        for (HaplotypePath path : paths.values()) {
            for (HaplotypeNode node : path.nodes()) {
                result.add(node);
            }
        }
        return result.build();
    }
    /**
     * Returns a list of HaplotypeNodes corresponding to the given hapids in the given graph.
     *
     * @param graph graph
     * @param hapids haplotype ids
     *
     * @return list of HaplotypeNodes
     */
    public static List<HaplotypeNode> nodes(HaplotypeGraph graph, SortedSet<Integer> hapids) {
        return graph.nodeStream().parallel().filter(node -> hapids.contains(node.id())).collect(Collectors.toList());
    }
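    /**
     * Writes a tab-delimited presence / absence matrix to the given file. Each row is a
     * haplotype node (reference range interval plus haplotype id), and each taxon column
     * holds 1 if that taxon is represented by the node, 0 otherwise.
     *
     * @param graph graph
     * @param filename output filename
     */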
    public static void presenceAbsenceTaxonByNode(HaplotypeGraph graph, String filename) {
        TaxaList taxa = graph.taxaInGraph();
        try (BufferedWriter writer = Utils.getBufferedWriter(filename)) {
            writer.write("interval\thaplotypes_id");
            for (Taxon taxon : taxa) {
                writer.write("\t");
                writer.write(taxon.getName());
            }
            writer.write("\n");
            graph.referenceRanges().forEach(range -> {
                for (HaplotypeNode node : graph.nodes(range)) {
                    try {
                        writer.write(range.intervalString());
                        writer.write("\t");
                        writer.write(String.valueOf(node.id()));
                        for (Taxon taxon : taxa) {
                            writer.write("\t");
                            if (node.taxaList().contains(taxon)) {
                                writer.write("1");
                            } else {
                                writer.write("0");
                            }
                        }
                        writer.write("\n");
                    } catch (Exception ex) {
                        throw new IllegalStateException("GraphUtils: presenceAbsenceTaxonByNode: problem writing: " + filename + "\n" + ex.getMessage());
                    }
                }
            });
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: presenceAbsenceTaxonByNode: problem writing: " + filename + "\n" + e.getMessage());
        }
    }
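    /**
     * Returns the distribution of taxa representation across reference ranges.
     * The value at index i is the number of reference ranges represented by exactly i taxa.
     *
     * @param graph graph
     *
     * @return histogram of reference range counts indexed by number of taxa
     */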
    public static int[] distributionTaxaRepresented(HaplotypeGraph graph) {
        int totalNumTaxa = graph.totalNumberTaxa();
        // array needs totalNumTaxa + 1 slots so a range represented by every taxon can be counted
        return graph.referenceRangeStream().mapToInt(range -> graph.numberTaxa(range))
                .collect(() -> new int[totalNumTaxa + 1], (ints, value) -> ints[value]++, (ints, ints2) -> {
                    for (int i = 0; i < ints.length; i++) {
                        ints[i] += ints2[i];
                    }
                });
    }
    /*
    public static int[] cumulativeDistributionTaxaRepresented(HaplotypeGraph graph) {
        int[] result = new int[graph.totalNumberTaxa() + 1];
        for (ReferenceRange range : graph.referenceRanges()) {
            int numTaxa = graph.numberTaxa(range);
            for (int i = 0; i <= numTaxa; i++) {
                result[i]++;
            }
        }
        return result;
    }
    */
    /**
     * Gets the list of HaplotypeNodes whose taxa list exactly matches the given taxa.
     *
     * @param taxa taxa list
     * @param graph graph
     *
     * @return list of nodes
     */
    public static List<HaplotypeNode> nodesContainingExactly(TaxaList taxa, HaplotypeGraph graph) {
        int numInputTaxa = taxa.numberOfTaxa();
        return graph.referenceRangeStream()
                .flatMap(range -> graph.nodes(range).stream())
                .filter(node -> node.numTaxa() == numInputTaxa)
                .filter(node -> {
                    boolean keep = true;
                    for (Taxon taxon : taxa) {
                        if (!node.taxaList().contains(taxon)) {
                            keep = false;
                            break;
                        }
                    }
                    return keep;
                })
                .collect(Collectors.toList());
    }
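    /**
     * Counts, for each tag, how many of its associated taxa carry each haplotype node,
     * and writes the (tag, hapid, count) triples to a tab-delimited file. The input file
     * is tab-delimited with the taxon name in the first column and the tag in the second;
     * an optional header line starting with "Sample" is skipped.
     *
     * @param tagTaxaFile tab-delimited file of taxon / tag pairs
     * @param graph graph
     * @param outputFile output filename
     */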
    public static void tagNodePairCounts(String tagTaxaFile, HaplotypeGraph graph, String outputFile) {
        // Map of tags to list of taxa names
        Multimap<String, String> tagToTaxa = HashMultimap.create();
        try (BufferedReader reader = Utils.getBufferedReader(tagTaxaFile)) {
            String line = reader.readLine();
            if (line != null && line.startsWith("Sample")) {
                line = reader.readLine();
            }
            while (line != null) {
                String[] tokens = line.split("\t");
                String taxon = tokens[0];
                String tag = tokens[1];
                tagToTaxa.put(tag, taxon);
                line = reader.readLine();
            }
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: tagNodePairCounts: problem reading: " + tagTaxaFile + "\n" + e.getMessage());
        }
        // Map of taxon name to hapids
        Multimap<String, Integer> taxonToNode = graph.referenceRangeStream()
                .flatMap(range -> graph.nodes(range).stream())
                .collect(() -> HashMultimap.create(), (map, node) -> {
                    for (Taxon taxon : node.taxaList()) {
                        map.put(taxon.getName(), node.id());
                    }
                }, (BiConsumer<Multimap<String, Integer>, Multimap<String, Integer>>) (map, map2) -> map.putAll(map2));
        try (BufferedWriter writer = Utils.getBufferedWriter(outputFile)) {
            writer.write("tag\thapid\tcount\n");
            for (String tag : tagToTaxa.keySet()) {
                Multiset<Integer> counts = TreeMultiset.create();
                for (String taxon : tagToTaxa.get(tag)) {
                    for (Integer hapid : taxonToNode.get(taxon)) {
                        counts.add(hapid);
                    }
                }
                for (Integer hapid : counts.elementSet()) {
                    writer.write(tag);
                    writer.write("\t");
                    writer.write(String.valueOf(hapid));
                    writer.write("\t");
                    writer.write(String.valueOf(counts.count(hapid)));
                    writer.write("\n");
                }
            }
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: tagNodePairCounts: problem writing: " + outputFile + "\n" + e.getMessage());
        }
    }
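    /**
     * Writes one line per (haplotype node, taxon) pair to a tab-delimited file with
     * columns interval, haplotypes_id, and sample.
     *
     * @param graph graph
     * @param filename output filename
     */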
    public static void nodeTaxaPairs(HaplotypeGraph graph, String filename) {
        try (BufferedWriter writer = Utils.getBufferedWriter(filename)) {
            writer.write("interval\thaplotypes_id\tsample\n");
            graph.nodeStream().forEach(node -> {
                String interval = node.referenceRange().intervalString();
                String hapID = String.valueOf(node.id());
                for (Taxon taxon : node.taxaList()) {
                    try {
                        writer.write(interval);
                        writer.write("\t");
                        writer.write(hapID);
                        writer.write("\t");
                        writer.write(taxon.getName());
                        writer.write("\n");
                    } catch (Exception ex) {
                        throw new IllegalStateException("GraphUtils: nodeTaxaPairs: problem writing: " + filename + "\n" + ex.getMessage());
                    }
                }
            });
        } catch (Exception e) {
            throw new IllegalStateException("GraphUtils: nodeTaxaPairs: problem writing: " + filename + "\n" + e.getMessage());
        }
    }
    /**
     * Returns a sorted set of positions that are variant (SNP) positions in the given graph.
     *
     * @param graph graph
     *
     * @return sorted set of positions
     */
    public static SortedSet<Position> snpPositions(HaplotypeGraph graph) {
        return snpPositions(graph, null);
    }
    /**
     * Returns a sorted set of positions that are variant (SNP) positions in the given graph.
     *
     * @param graph graph
     * @param referenceRanges set of reference range ids to include in the result.
     * If null, all ranges are included.
     *
     * @return sorted set of positions
     */
    public static SortedSet<Position> snpPositions(HaplotypeGraph graph, Collection<Integer> referenceRanges) {
        return graph.nodeStream().parallel()
                .filter(node -> referenceRanges == null || referenceRanges.contains(node.referenceRange().id()))
                .map(node -> node.variantInfos())
                .filter(contexts -> contexts.isPresent())
                .flatMap((Function<Optional<List<VariantInfo>>, Stream<VariantInfo>>) contexts -> contexts.get().stream())
                .filter(context -> {
                    // TODO undo indel filtering
                    return context.isVariant() && !context.isIndel();
                })
                .collect(() -> new TreeSet<>(),
                        (positions, context) -> {
                            String reference = context.refAlleleString();
                            //String alternate = context.getAlternateAllele(0).getBaseString();
                            int start = context.start();
                            for (int i = start; i <= context.end(); i++) {
                                GeneralPosition.Builder builder = Position.builder(context.chromosome(), i)
                                        .allele(WHICH_ALLELE.Reference, NucleotideAlignmentConstants.getNucleotideAlleleByte(reference.charAt(i - start)));
                                //.allele(WHICH_ALLELE.Alternate, NucleotideAlignmentConstants.getNucleotideAlleleByte(alternate.charAt(i - start)));
                                positions.add(builder.build());
                            }
                        },
                        (BiConsumer<SortedSet<Position>, SortedSet<Position>>) (positions, positions2) -> positions.addAll(positions2));
    }
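
    // Usage sketch (illustrative only; assumes a HaplotypeGraph has already been
    // built elsewhere, e.g. by the PHG pipeline, and that "graph" and the file
    // names below are placeholders):
    //
    //   SortedSet<Position> snps = GraphUtils.snpPositions(graph);
    //   GraphUtils.presenceAbsenceTaxonByNode(graph, "presence_absence.txt");
    //   int[] dist = GraphUtils.distributionTaxaRepresented(graph);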
}