Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
net.maizegenetics.taxa.TaxaListIOUtils Maven / Gradle / Ivy
package net.maizegenetics.taxa;
import com.google.common.base.Splitter;
import com.google.common.collect.*;
import net.maizegenetics.util.TableReportUtils;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.*;
/**
* Utilities for reading and writing IdGroup and PedigreeIdGroups.
*
* @author Ed Buckler
*/
public class TaxaListIOUtils {
private static final Logger myLogger = LogManager.getLogger(TaxaListIOUtils.class);
private static final String DELIMITER = "\t";
private TaxaListIOUtils() {
}
/**
* Create a Multimap of all the taxa associated with a particular annotation
* value.
*
* @param taxaList input taxa list with annotation associated with
* @param annotation annotation key used to create the multimap, the values
* of these keys become the key of the resulting Multimap
*
* @return Map of AnnotationValues -> Taxon
*/
public static Multimap getMapOfTaxonByAnnotation(TaxaList taxaList, String annotation) {
ImmutableMultimap.Builder annoMap = new ImmutableMultimap.Builder().orderKeysBy(Ordering.natural());
for (Taxon taxon : taxaList) {
for (String value : taxon.getAnnotation().getTextAnnotation(annotation)) {
annoMap.put(value, taxon);
}
}
return annoMap.build();
}
/**
* Create a Map of all the taxa associated with a particular annotation
* value. If there would be a duplicate mapping, then an Optional.empty() is returned.
*
* @param taxaList input taxa list with annotation associated with
* @param annotation annotation key used to create the map, the values
* of these keys become the key of the resulting map
*
* @return Map of AnnotationValues -> Taxon
*/
public static Optional> getUniqueMapOfTaxonByAnnotation(TaxaList taxaList, String annotation) {
Map annoMap = new TreeMap<>();
for (Taxon taxon : taxaList) {
for (String value : taxon.getAnnotation().getTextAnnotation(annotation)) {
if (annoMap.containsKey(value)) return Optional.empty();
annoMap.put(value, taxon);
}
}
return Optional.of(ImmutableSortedMap.copyOf(annoMap));
}
/**
* Returns a subsetted taxa list based on annotation value. For example,
* return all taxa where {@literal GermType=Inbred}.
*
* @param baseTaxaList base annotated taxa list
* @param annotation annotation name (key)
* @param annoValue annotation value being tested for
*
* @return TaxaList equal to the annotation value
*/
public static TaxaList subsetTaxaListByAnnotation(TaxaList baseTaxaList, String annotation, String annoValue) {
TaxaListBuilder tlb = new TaxaListBuilder();
for (Taxon taxon : baseTaxaList) {
for (String value : taxon.getAnnotation().getTextAnnotation(annotation)) {
if (value.equals(annoValue)) {
tlb.add(taxon);
break;
}
}
}
return tlb.build();
}
/**
* Creates a new taxa list with the taxa only retaining annotations within a
* specified list. All taxa are retained, only the annotations are changed.
*
* @param baseTaxaList
* @param annotationsToKeep the retained keys annotation
*
* @return new TaxaList with a subset of the annotations
*/
public static TaxaList retainSpecificAnnotations(TaxaList baseTaxaList, String[] annotationsToKeep) {
Set keepers = new ImmutableSet.Builder().addAll(Arrays.asList(annotationsToKeep)).build();
TaxaListBuilder tlb = new TaxaListBuilder();
for (Taxon taxon : baseTaxaList) {
Taxon.Builder tb = new Taxon.Builder(taxon.getName());
for (Map.Entry entry : taxon.getAnnotation().getAllAnnotationEntries()) {
if (keepers.contains(entry.getKey())) {
tb.addAnno(entry.getKey(), entry.getValue());
}
}
tlb.add(tb.build());
}
return tlb.build();
}
/**
* Creates a new taxa list with the taxa retaining annotations EXCEPT those
* specified by the list. All taxa are retained, only the annotations are
* changed.
*
* @param baseTaxaList
* @param annotationsToRemove the retained keys annotation
*
* @return new TaxaList with a subset of the annotations
*/
public static TaxaList removeSpecificAnnotations(TaxaList baseTaxaList, String[] annotationsToRemove) {
Set keepers = new ImmutableSet.Builder().addAll(Arrays.asList(annotationsToRemove)).build();
TaxaListBuilder tlb = new TaxaListBuilder();
for (Taxon taxon : baseTaxaList) {
Taxon.Builder tb = new Taxon.Builder(taxon.getName());
for (Map.Entry entry : taxon.getAnnotation().getAllAnnotationEntries()) {
if (!keepers.contains(entry.getKey())) {
tb.addAnno(entry.getKey(), entry.getValue());
}
}
tlb.add(tb.build());
}
return tlb.build();
}
/**
* Provides the set of all annotation key found in any of taxa
*
* @param baseTaxaList
*
* @return
*/
public static Set allAnnotationKeys(TaxaList baseTaxaList) {
ImmutableSet.Builder keepers = new ImmutableSet.Builder();
for (Taxon taxon : baseTaxaList) {
for (Map.Entry entry : taxon.getAnnotation().getAllAnnotationEntries()) {
keepers.add(entry.getKey());
}
}
return keepers.build();
}
public static void exportAnnotatedTaxaListTable(TaxaList taxa, String filename) {
try (BufferedWriter writer = Utils.getBufferedWriter(filename)) {
writer.append("\n");
TableReportUtils.saveDelimitedTableReport(new TaxaListTableReport(taxa), DELIMITER, writer, true);
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("TaxaListIOUtils: exportAnnotatedTaxaListTable: problem saving file: " + filename);
}
}
public static TaxaList importAnnotatedTaxaList(String filename) {
TaxaListBuilder builder = new TaxaListBuilder();
try (BufferedReader reader = Utils.getBufferedReader(filename)) {
String header = reader.readLine().trim();
if (!header.equalsIgnoreCase("")) {
throw new IllegalArgumentException("TaxaListIOUtils: importAnnotatedTaxaList: This file doesn't start with : " + filename);
}
header = reader.readLine().trim();
String[] columns = header.split(DELIMITER);
for (int i = 0; i < columns.length; i++) {
columns[i] = columns[i].trim();
}
if (!columns[0].equalsIgnoreCase("Taxa")) {
throw new IllegalArgumentException("TaxaListIOUtils: importAnnotatedTaxaList: First column should be Taxa: " + filename);
}
int numColumns = columns.length;
int lineNum = 2;
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
lineNum++;
String[] annotations = line.split(DELIMITER);
if (numColumns != annotations.length) {
throw new IllegalStateException("TaxaListIOUtils: importAnnotatedTaxaList: number of annotations doesn't match number of columns line: " + lineNum + " taxon: " + annotations[0].trim());
}
Taxon.Builder currentTaxon = new Taxon.Builder(annotations[0].trim());
for (int i = 1; i < numColumns; i++) {
String value = annotations[i].trim();
if (!value.isEmpty()) {
currentTaxon.addAnno(columns[i], value);
}
}
builder.add(currentTaxon.build());
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("TaxaListIOUtils: importAnnotatedTaxaList: Problem reading file: " + filename + "\n" + e.getMessage());
}
return builder.build();
}
/**
* Returns an annotated TaxaList from a text annotation file in matrix
* format. This is a tab delimited file. First row in the file with the
* field {@literal taxaNameField} is the header row.
* {@literal taxaNameField} indicated the taxon name, all other fields are
* user defined. The fields become the keys for the taxa annotation.
* Quantitative fields should be tagged with "#" sign, e.g.
* {@literal <#INBREEDF>}. Multiple values are supported per key, and
* additional values can be either described with an additional column or
* ";" to delimit values with the same key.
*
*
* Filters are a map of filters to be applied. Key are the fields, and value
* are what are tested for equality. Only taxa rows true for filters are
* retained.
*
*
{@literal <#InbreedF> }
* {@literal B73 Inbred Goodman282 0.98 ISU;IBMFounder}
* {@literal MO17 Inbred Goodman282 0.98 UMC;IBMFounder}
*
*
* Produces:
* {@literal B73}
* {@literal MO17}
* The standardized keys are described in the
* {@link net.maizegenetics.taxa.Taxon}, and these constant fields are all
* upper case.
*
* @param fileName with complete path
* @param taxaNameField field name with the taxon name
* @param filters Map of filter to determine which rows to retain as the
* file is processed.
*
* @return TaxaList with annotations
*/
// New version calls readTaxaAnnotationFileAL, then creates TaxaList from ArrayList
public static TaxaList readTaxaAnnotationFile(String fileName, String taxaNameField, Map filters, boolean mergeSameNames) {
// create list
ArrayList taxaAL = readTaxaAnnotationFileAL(fileName, taxaNameField, filters);
if (taxaAL == null) return null;
TaxaListBuilder tlb = new TaxaListBuilder();
taxaAL.stream().forEach(taxa -> {
if (mergeSameNames) {
tlb.addOrMerge(taxa);
} else {
tlb.add(taxa);
}
});
return tlb.sortTaxaAlphabetically().build();
}
// Version of readTaxaAnnotationFile that returns an ArrayList. This is called
// from places that wish to allow duplicate taxa. The TaxaList does not allow for
// duplicate entries. An ArrayList does.
public static ArrayList readTaxaAnnotationFileAL(String fileName, String taxaNameField, Map filters) {
try {
BufferedReader fileIn = Utils.getBufferedReader(fileName, 1000000);
fileIn.mark(1 << 16);
String line = fileIn.readLine();
ArrayList taxaAL = new ArrayList();
int indexOfName = 0;
//parse headers
List headers = new ArrayList<>();
List isQuant = new ArrayList<>();
if (line.contains(taxaNameField)) {
int i = 0;
for (String header : line.split("\\t")) {
if (header.equals(taxaNameField)) {
indexOfName = i;
}
isQuant.add(header.startsWith("#") || header.startsWith("<#"));
headers.add(header.replace(">", "").replace("<", "").replace("#", ""));
i++;
}
} else {
fileIn.reset();
}
//parse taxa rows
while ((line = fileIn.readLine()) != null) {
String[] s = line.split("\\t");
Taxon.Builder anID = new Taxon.Builder(s[indexOfName]);
for (int i = 0; i < s.length; i++) {
if (i == indexOfName) {
continue;
}
String[] cs = s[i].split(";");
for (String ta : cs) {
if (ta == null || ta.isEmpty()) {
continue;
}
if (isQuant.get(i)) {
if (ta.equals("NA")) {
anID.addAnno(headers.get(i), Double.NaN);
} else {
anID.addAnno(headers.get(i), Double.parseDouble(ta));
}
} else {
anID.addAnno(headers.get(i), ta);
}
}
}
Taxon t = anID.build();
if (doesTaxonHaveAllAnnotations(t, filters)) {
taxaAL.add(t);
}
}
// Sort alphabetically based on name. This is to remain consistent
// with the readTaxaAnnotationFile(), which alphabetizes the taxaList
Collections.sort(taxaAL, new Comparator() {
public int compare(Taxon taxa1, Taxon taxa2) {
return taxa1.getName().compareTo(taxa2.getName());
}
});
return taxaAL;
} catch (Exception e) {
System.err.println("Error in Reading Annotated Taxon File:" + fileName);
e.printStackTrace();
}
return null;
}
/**
* Returns an annotated TaxaList from a text annotation file in matrix
* format. This is a tab delimited file. First row in the file with the
* field {@literal taxaNameField} is the header row.
* {@literal taxaNameField} indicated the taxon name, all other fields are
* user defined. The fields become the keys for the taxa annotation.
* Quantitative fields should be tagged with "#" sign, e.g.
* {@literal <#INBREEDF>}. Multiple values are supported per key, and
* additional values can be either described with an additional column or
* ";" to delimit values with the same key.
*
*
{@literal <#InbreedF> }
* {@literal B73 Inbred Goodman282 0.98 ISU;IBMFounder}
* {@literal MO17 Inbred Goodman282 0.98 UMC;IBMFounder}
*
*
* Produces:
* {@literal B73}
* {@literal MO17}
* The standardized keys are described in the
* {@link net.maizegenetics.taxa.Taxon}, and these constant fields are all
* upper case.
*
* @param fileName with complete path
* @param taxaNameField field name with the taxon name
*
* @return TaxaList with annotations
*/
public static TaxaList readTaxaAnnotationFile(String fileName, String taxaNameField) {
return readTaxaAnnotationFile(fileName, taxaNameField, new HashMap(), false);
}
/**
* Tests whether a taxon has annotation values in the map
*
* @param taxon
* @param filters
*
* @return true if all present, false is otherwise
*/
public static boolean doesTaxonHaveAllAnnotations(Taxon taxon, Map filters) {
SetMultimap taxonAnno = taxon.getAnnotation().getAnnotationAsMap();
boolean keep = true;
for (Map.Entry entry : filters.entrySet()) {
keep = false;
for (String s1 : taxonAnno.get(entry.getKey())) {
if (s1.equals(entry.getValue())) {
keep = true;
}
}
if (keep == false) {
break;
}
}
return keep;
}
/**
* Parses a VCF header with the taxa names and annotations into a multimap.
* The taxa name is return as the "ID" key, as used by the VCF format.
*
* @param s
*
* @return
*/
public static SetMultimap parseVCFHeadersIntoMap(String s) {
if (s == null) {
return null;
}
if (!(s.startsWith("<") && s.endsWith(">"))) {
return null;
}
String value = s.substring(1, s.length() - 1);
ImmutableSetMultimap.Builder im = new ImmutableSetMultimap.Builder()
.orderKeysBy(Ordering.natural()).orderValuesBy(Ordering.natural());
for (String s1 : Splitter.on(",").trimResults().split(value)) {
String[] ssEntry = s1.split("=", 2);
im.put(ssEntry[0], ssEntry[1]);
}
return im.build();
}
/**
* This method takes a key file and creates a SortedSet that
* contains a set of the tissue values. The set will be null if
* no tissues are present
*
* @param fileName - name of Keyfile containing Tissue header
* @param tissueNameField - field name
*
* @return
*/
public static List readTissueAnnotationFile(String fileName, String tissueNameField) {
try {
BufferedReader fileIn = Utils.getBufferedReader(fileName, 1000000);
fileIn.mark(1 << 16);
String line = fileIn.readLine();
List tissues = new ArrayList();
int indexOfTissue = -1;
//parse headers
if (line.contains(tissueNameField)) {
int idx = 0;
for (String header : line.split("\\t")) {
if (header.equals(tissueNameField)) {
indexOfTissue = idx;
break;
}
idx++;
}
if (indexOfTissue == -1) {
// Tissue header not found - return null
return null;
}
} else {
fileIn.reset();
}
if (indexOfTissue == -1) return null;
// Found tissue header, read values - no duplicates, into set
while ((line = fileIn.readLine()) != null) {
String[] items = line.split("\\t");
for (int idx = 0; idx < items.length; idx++) {
if (idx == indexOfTissue) {
if (!(tissues.contains(items[idx]))) {
tissues.add(items[idx]);
}
continue;
}
}
}
Collections.sort(tissues);
return tissues;
} catch (Exception e) {
System.err.println("Error in Reading Annotated Tissue File:" + fileName);
e.printStackTrace();
}
return null;
}
}