All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.taxa.TaxaListIOUtils Maven / Gradle / Ivy

package net.maizegenetics.taxa;

import com.google.common.base.Splitter;
import com.google.common.collect.*;
import net.maizegenetics.util.TableReportUtils;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.*;

/**
 * Utilities for reading and writing IdGroup and PedigreeIdGroups.
 *
 * @author Ed Buckler
 */
public class TaxaListIOUtils {

    private static final Logger myLogger = LogManager.getLogger(TaxaListIOUtils.class);

    private static final String DELIMITER = "\t";

    private TaxaListIOUtils() {
    }

    /**
     * Create a Multimap of all the taxa associated with a particular annotation
     * value.
     *
     * @param taxaList input taxa list with annotation associated with
     * @param annotation annotation key used to create the multimap, the values
     * of these keys become the key of the resulting Multimap
     *
     * @return Map of AnnotationValues -> Taxon
     */
    public static Multimap getMapOfTaxonByAnnotation(TaxaList taxaList, String annotation) {
        ImmutableMultimap.Builder annoMap = new ImmutableMultimap.Builder().orderKeysBy(Ordering.natural());
        for (Taxon taxon : taxaList) {
            for (String value : taxon.getAnnotation().getTextAnnotation(annotation)) {
                annoMap.put(value, taxon);
            }
        }
        return annoMap.build();
    }

    /**
     * Create a Map of all the taxa associated with a particular annotation
     * value.  If there would be a duplicate mapping, then an Optional.empty() is returned.
     *
     * @param taxaList input taxa list with annotation associated with
     * @param annotation annotation key used to create the map, the values
     * of these keys become the key of the resulting map
     *
     * @return Map of AnnotationValues -> Taxon
     */
    public static Optional> getUniqueMapOfTaxonByAnnotation(TaxaList taxaList, String annotation) {
        Map annoMap = new TreeMap<>();
        for (Taxon taxon : taxaList) {
            for (String value : taxon.getAnnotation().getTextAnnotation(annotation)) {
                if (annoMap.containsKey(value)) return Optional.empty();
                annoMap.put(value, taxon);
            }
        }
        return Optional.of(ImmutableSortedMap.copyOf(annoMap));
    }

    /**
     * Returns a subsetted taxa list based on annotation value. For example,
     * return all taxa where {@literal GermType=Inbred}.
     *
     * @param baseTaxaList base annotated taxa list
     * @param annotation annotation name (key)
     * @param annoValue annotation value being tested for
     *
     * @return TaxaList equal to the annotation value
     */
    public static TaxaList subsetTaxaListByAnnotation(TaxaList baseTaxaList, String annotation, String annoValue) {
        TaxaListBuilder tlb = new TaxaListBuilder();
        for (Taxon taxon : baseTaxaList) {
            for (String value : taxon.getAnnotation().getTextAnnotation(annotation)) {
                if (value.equals(annoValue)) {
                    tlb.add(taxon);
                    break;
                }
            }
        }
        return tlb.build();
    }

    /**
     * Creates a new taxa list with the taxa only retaining annotations within a
     * specified list. All taxa are retained, only the annotations are changed.
     *
     * @param baseTaxaList
     * @param annotationsToKeep the retained keys annotation
     *
     * @return new TaxaList with a subset of the annotations
     */
    public static TaxaList retainSpecificAnnotations(TaxaList baseTaxaList, String[] annotationsToKeep) {
        Set keepers = new ImmutableSet.Builder().addAll(Arrays.asList(annotationsToKeep)).build();
        TaxaListBuilder tlb = new TaxaListBuilder();
        for (Taxon taxon : baseTaxaList) {
            Taxon.Builder tb = new Taxon.Builder(taxon.getName());
            for (Map.Entry entry : taxon.getAnnotation().getAllAnnotationEntries()) {
                if (keepers.contains(entry.getKey())) {
                    tb.addAnno(entry.getKey(), entry.getValue());
                }
            }
            tlb.add(tb.build());
        }
        return tlb.build();
    }

    /**
     * Creates a new taxa list with the taxa retaining annotations EXCEPT those
     * specified by the list. All taxa are retained, only the annotations are
     * changed.
     *
     * @param baseTaxaList
     * @param annotationsToRemove the retained keys annotation
     *
     * @return new TaxaList with a subset of the annotations
     */
    public static TaxaList removeSpecificAnnotations(TaxaList baseTaxaList, String[] annotationsToRemove) {
        Set keepers = new ImmutableSet.Builder().addAll(Arrays.asList(annotationsToRemove)).build();
        TaxaListBuilder tlb = new TaxaListBuilder();
        for (Taxon taxon : baseTaxaList) {
            Taxon.Builder tb = new Taxon.Builder(taxon.getName());
            for (Map.Entry entry : taxon.getAnnotation().getAllAnnotationEntries()) {
                if (!keepers.contains(entry.getKey())) {
                    tb.addAnno(entry.getKey(), entry.getValue());
                }
            }
            tlb.add(tb.build());
        }
        return tlb.build();
    }

    /**
     * Provides the set of all annotation key found in any of taxa
     *
     * @param baseTaxaList
     *
     * @return
     */
    public static Set allAnnotationKeys(TaxaList baseTaxaList) {
        ImmutableSet.Builder keepers = new ImmutableSet.Builder();
        for (Taxon taxon : baseTaxaList) {
            for (Map.Entry entry : taxon.getAnnotation().getAllAnnotationEntries()) {
                keepers.add(entry.getKey());
            }
        }
        return keepers.build();
    }

    public static void exportAnnotatedTaxaListTable(TaxaList taxa, String filename) {
        try (BufferedWriter writer = Utils.getBufferedWriter(filename)) {
            writer.append("\n");
            TableReportUtils.saveDelimitedTableReport(new TaxaListTableReport(taxa), DELIMITER, writer, true);
        } catch (Exception e) {
            myLogger.debug(e.getMessage(), e);
            throw new IllegalStateException("TaxaListIOUtils: exportAnnotatedTaxaListTable: problem saving file: " + filename);
        }
    }

    public static TaxaList importAnnotatedTaxaList(String filename) {
        TaxaListBuilder builder = new TaxaListBuilder();
        try (BufferedReader reader = Utils.getBufferedReader(filename)) {
            String header = reader.readLine().trim();
            if (!header.equalsIgnoreCase("")) {
                throw new IllegalArgumentException("TaxaListIOUtils: importAnnotatedTaxaList: This file doesn't start with : " + filename);
            }
            header = reader.readLine().trim();
            String[] columns = header.split(DELIMITER);
            for (int i = 0; i < columns.length; i++) {
                columns[i] = columns[i].trim();
            }
            if (!columns[0].equalsIgnoreCase("Taxa")) {
                throw new IllegalArgumentException("TaxaListIOUtils: importAnnotatedTaxaList: First column should be Taxa: " + filename);
            }
            int numColumns = columns.length;
            int lineNum = 2;
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                lineNum++;
                String[] annotations = line.split(DELIMITER);
                if (numColumns != annotations.length) {
                    throw new IllegalStateException("TaxaListIOUtils: importAnnotatedTaxaList: number of annotations doesn't match number of columns line: " + lineNum + " taxon: " + annotations[0].trim());
                }
                Taxon.Builder currentTaxon = new Taxon.Builder(annotations[0].trim());
                for (int i = 1; i < numColumns; i++) {
                    String value = annotations[i].trim();
                    if (!value.isEmpty()) {
                        currentTaxon.addAnno(columns[i], value);
                    }
                }
                builder.add(currentTaxon.build());
            }
        } catch (Exception e) {
            myLogger.debug(e.getMessage(), e);
            throw new IllegalStateException("TaxaListIOUtils: importAnnotatedTaxaList: Problem reading file: " + filename + "\n" + e.getMessage());
        }

        return builder.build();
    }

    /**
     * Returns an annotated TaxaList from a text annotation file in matrix
     * format. This is a tab delimited file. First row in the file with the
     * field {@literal taxaNameField} is the header row.
     * {@literal taxaNameField} indicated the taxon name, all other fields are
     * user defined. The fields become the keys for the taxa annotation.
     * Quantitative fields should be tagged with "#" sign, e.g.
     * {@literal <#INBREEDF>}. Multiple values are supported per key, and
     * additional values can be either described with an additional column or
     * ";" to delimit values with the same key.
     * 

*

* Filters are a map of filters to be applied. Key are the fields, and value * are what are tested for equality. Only taxa rows true for filters are * retained. *

*

{@literal <#InbreedF> }

* {@literal B73 Inbred Goodman282 0.98 ISU;IBMFounder}

* {@literal MO17 Inbred Goodman282 0.98 UMC;IBMFounder}

*

*

* Produces:

* {@literal B73}

* {@literal MO17}

* The standardized keys are described in the * {@link net.maizegenetics.taxa.Taxon}, and these constant fields are all * upper case. * * @param fileName with complete path * @param taxaNameField field name with the taxon name * @param filters Map of filter to determine which rows to retain as the * file is processed. * * @return TaxaList with annotations */ // New version calls readTaxaAnnotationFileAL, then creates TaxaList from ArrayList public static TaxaList readTaxaAnnotationFile(String fileName, String taxaNameField, Map filters, boolean mergeSameNames) { // create list ArrayList taxaAL = readTaxaAnnotationFileAL(fileName, taxaNameField, filters); if (taxaAL == null) return null; TaxaListBuilder tlb = new TaxaListBuilder(); taxaAL.stream().forEach(taxa -> { if (mergeSameNames) { tlb.addOrMerge(taxa); } else { tlb.add(taxa); } }); return tlb.sortTaxaAlphabetically().build(); } // Version of readTaxaAnnotationFile that returns an ArrayList. This is called // from places that wish to allow duplicate taxa. The TaxaList does not allow for // duplicate entries. An ArrayList does. public static ArrayList readTaxaAnnotationFileAL(String fileName, String taxaNameField, Map filters) { try { BufferedReader fileIn = Utils.getBufferedReader(fileName, 1000000); fileIn.mark(1 << 16); String line = fileIn.readLine(); ArrayList taxaAL = new ArrayList(); int indexOfName = 0; //parse headers List headers = new ArrayList<>(); List isQuant = new ArrayList<>(); if (line.contains(taxaNameField)) { int i = 0; for (String header : line.split("\\t")) { if (header.equals(taxaNameField)) { indexOfName = i; } isQuant.add(header.startsWith("#") || header.startsWith("<#")); headers.add(header.replace(">", "").replace("<", "").replace("#", "")); i++; } } else { fileIn.reset(); } //parse taxa rows while ((line = fileIn.readLine()) != null) { String[] s = line.split("\\t"); Taxon.Builder anID = new Taxon.Builder(s[indexOfName]); for (int i = 0; i < s.length; i++) { if (i == indexOfName) { continue; } String[] cs = s[i].split(";"); for (String ta : cs) { if (ta == null || ta.isEmpty()) { continue; } if (isQuant.get(i)) { if (ta.equals("NA")) { anID.addAnno(headers.get(i), Double.NaN); } else { anID.addAnno(headers.get(i), Double.parseDouble(ta)); } } else { anID.addAnno(headers.get(i), ta); } } } Taxon t = anID.build(); if (doesTaxonHaveAllAnnotations(t, filters)) { taxaAL.add(t); } } // Sort alphabetically based on name. This is to remain consistent // with the readTaxaAnnotationFile(), which alphabetizes the taxaList Collections.sort(taxaAL, new Comparator() { public int compare(Taxon taxa1, Taxon taxa2) { return taxa1.getName().compareTo(taxa2.getName()); } }); return taxaAL; } catch (Exception e) { System.err.println("Error in Reading Annotated Taxon File:" + fileName); e.printStackTrace(); } return null; } /** * Returns an annotated TaxaList from a text annotation file in matrix * format. This is a tab delimited file. First row in the file with the * field {@literal taxaNameField} is the header row. * {@literal taxaNameField} indicated the taxon name, all other fields are * user defined. The fields become the keys for the taxa annotation. * Quantitative fields should be tagged with "#" sign, e.g. * {@literal <#INBREEDF>}. Multiple values are supported per key, and * additional values can be either described with an additional column or * ";" to delimit values with the same key. *

*

{@literal <#InbreedF> }

* {@literal B73 Inbred Goodman282 0.98 ISU;IBMFounder}

* {@literal MO17 Inbred Goodman282 0.98 UMC;IBMFounder}

*

*

* Produces:

* {@literal B73}

* {@literal MO17}

* The standardized keys are described in the * {@link net.maizegenetics.taxa.Taxon}, and these constant fields are all * upper case. * * @param fileName with complete path * @param taxaNameField field name with the taxon name * * @return TaxaList with annotations */ public static TaxaList readTaxaAnnotationFile(String fileName, String taxaNameField) { return readTaxaAnnotationFile(fileName, taxaNameField, new HashMap(), false); } /** * Tests whether a taxon has annotation values in the map * * @param taxon * @param filters * * @return true if all present, false is otherwise */ public static boolean doesTaxonHaveAllAnnotations(Taxon taxon, Map filters) { SetMultimap taxonAnno = taxon.getAnnotation().getAnnotationAsMap(); boolean keep = true; for (Map.Entry entry : filters.entrySet()) { keep = false; for (String s1 : taxonAnno.get(entry.getKey())) { if (s1.equals(entry.getValue())) { keep = true; } } if (keep == false) { break; } } return keep; } /** * Parses a VCF header with the taxa names and annotations into a multimap. * The taxa name is return as the "ID" key, as used by the VCF format. * * @param s * * @return */ public static SetMultimap parseVCFHeadersIntoMap(String s) { if (s == null) { return null; } if (!(s.startsWith("<") && s.endsWith(">"))) { return null; } String value = s.substring(1, s.length() - 1); ImmutableSetMultimap.Builder im = new ImmutableSetMultimap.Builder() .orderKeysBy(Ordering.natural()).orderValuesBy(Ordering.natural()); for (String s1 : Splitter.on(",").trimResults().split(value)) { String[] ssEntry = s1.split("=", 2); im.put(ssEntry[0], ssEntry[1]); } return im.build(); } /** * This method takes a key file and creates a SortedSet that * contains a set of the tissue values. The set will be null if * no tissues are present * * @param fileName - name of Keyfile containing Tissue header * @param tissueNameField - field name * * @return */ public static List readTissueAnnotationFile(String fileName, String tissueNameField) { try { BufferedReader fileIn = Utils.getBufferedReader(fileName, 1000000); fileIn.mark(1 << 16); String line = fileIn.readLine(); List tissues = new ArrayList(); int indexOfTissue = -1; //parse headers if (line.contains(tissueNameField)) { int idx = 0; for (String header : line.split("\\t")) { if (header.equals(tissueNameField)) { indexOfTissue = idx; break; } idx++; } if (indexOfTissue == -1) { // Tissue header not found - return null return null; } } else { fileIn.reset(); } if (indexOfTissue == -1) return null; // Found tissue header, read values - no duplicates, into set while ((line = fileIn.readLine()) != null) { String[] items = line.split("\\t"); for (int idx = 0; idx < items.length; idx++) { if (idx == indexOfTissue) { if (!(tissues.contains(items[idx]))) { tissues.add(items[idx]); } continue; } } } Collections.sort(tissues); return tissues; } catch (Exception e) { System.err.println("Error in Reading Annotated Tissue File:" + fileName); e.printStackTrace(); } return null; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy