org.intermine.bio.dataconversion.HomologeneConverter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bio-source-homologene Show documentation
Show all versions of bio-source-homologene Show documentation
InterMine bio sources modules
The newest version!
package org.intermine.bio.dataconversion;
/*
* Copyright (C) 2002-2022 FlyMine
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. See the LICENSE file for more
* information or http://www.gnu.org/copyleft/lesser.html.
*
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.collections.keyvalue.MultiKey;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.tools.ant.BuildException;
import org.intermine.dataconversion.ItemWriter;
import org.intermine.metadata.Model;
import org.intermine.objectstore.ObjectStoreException;
import org.intermine.util.FormattedTextParser;
import org.intermine.metadata.StringUtil;
import org.intermine.xml.full.Item;
/**
* HomoloGene data converter, to use symbol and organism to identify a gene
*
* @author Fengyuan Hu
*/
public class HomologeneConverter extends BioFileConverter
{
// ---------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------
private static final Logger LOG = Logger.getLogger(HomologeneConverter.class);

// Data source name and data set title handed to the superclass constructor.
private static final String DATA_SOURCE_NAME = "HomoloGene";
private static final String DATASET_TITLE = "HomoloGene homology predictions";

// Classpath resource that readConfig() loads into 'props'.
private static final String PROP_FILE = "homologene_config.properties";
private static final String DEFAULT_IDENTIFIER_TYPE = "primaryIdentifier";
// private static final String DEFAULT_GENEID_TYPE = "symbol";

// Homologue type labels and evidence code values.
private static final String ORTHOLOGUE = "orthologue";
private static final String PARALOGUE = "paralogue";
private static final String EVIDENCE_CODE_ABBR = "AA";
private static final String EVIDENCE_CODE_NAME = "Amino acid sequence comparison";

// ---------------------------------------------------------------------
// Mutable state
// ---------------------------------------------------------------------
// NOTE(review): the collections below are declared with raw types; the
// setters store String tokens in taxonIds/homologues, the element types of
// the maps are not evident from the declarations - confirm before
// parameterising.

// Static (shared by all instances) and initially null.
private static String evidenceRefId;

// Created by setUpResolver() the first time it is called.
private IdResolver rslv;

// Taxon IDs configured through setHomologeneOrganisms().
private Set taxonIds = new HashSet();
// Taxon IDs configured through setHomologeneHomologues().
private Set homologues = new HashSet();

// Raw properties read from PROP_FILE by readConfig().
private Properties props = new Properties();
private Map config = new HashMap();
private Map identifiersToGenes = new HashMap();
/**
* Constructor. Passes the HomoloGene data source name and data set title to
* the BioFileConverter constructor and then calls readConfig(), which loads
* the converter's properties file from the classpath (and throws a
* RuntimeException if loading that file fails with an IOException).
*
* @param writer the ItemWriter used to handle the resultant items
* @param model the Model
*/
public HomologeneConverter(ItemWriter writer, Model model) {
super(writer, model, DATA_SOURCE_NAME, DATASET_TITLE);
// Load homologene_config.properties before any data file is processed.
readConfig();
}
/**
 * Sets the list of taxonIds that should be processed. All genes will be loaded.
 *
 * The value is split on single spaces; each token is trimmed and blank tokens
 * are discarded, so repeated, leading or trailing spaces do not leave empty
 * entries in the set. An empty or all-blank value therefore results in an
 * empty set, which process() reports as a missing configuration.
 *
 * @param taxonIds a space-separated list of taxonIds
 */
public void setHomologeneOrganisms(String taxonIds) {
    Set<String> parsedIds = new HashSet<String>();
    for (String token : StringUtil.split(taxonIds, " ")) {
        String id = token.trim();
        // Presumably StringUtil.split keeps empty tokens between adjacent
        // delimiters; filtering here is harmless if it does not.
        if (id.length() > 0) {
            parsedIds.add(id);
        }
    }
    this.taxonIds = parsedIds;
    LOG.info("Setting list of organisms to " + taxonIds);
}
/**
 * Sets the list of taxonIds of homologues that should be processed. These homologues will only
 * be processed if they are homologues for the organisms of interest.
 *
 * The value is split on single spaces; each token is trimmed and blank tokens
 * are discarded, so repeated, leading or trailing spaces do not leave empty
 * entries in the set.
 *
 * @param homologues a space-separated list of taxonIds
 */
public void setHomologeneHomologues(String homologues) {
    Set<String> parsedIds = new HashSet<String>();
    for (String token : StringUtil.split(homologues, " ")) {
        String id = token.trim();
        // Presumably StringUtil.split keeps empty tokens between adjacent
        // delimiters; filtering here is harmless if it does not.
        if (id.length() > 0) {
            parsedIds.add(id);
        }
    }
    this.homologues = parsedIds;
    LOG.info("Setting list of homologues to " + homologues);
}
/**
 * Reads a tab-delimited HomoloGene data file, collects the genes of each
 * HomoloGene group and passes every group to processHomologues().
 *
 * Rows belonging to the same group are expected to be adjacent: a group is
 * flushed when the group id in the first column changes, and once more after
 * the last row so that the final group is not lost.
 *
 * {@inheritDoc}
 *
 * @throws BuildException if no organisms have been configured
 */
@Override
public void process(Reader reader) throws Exception {
    /*
        homologene.data is a tab delimited file containing the following
        columns:
        1) HID (HomoloGene group id) - uid, https://www.ncbi.nlm.nih.gov/homologene?term=3[uid]
        2) Taxonomy ID
        3) Gene ID - NCBI Id
        4) Gene Symbol
        5) Protein gi
        6) Protein accession
    */
    // Check the configuration first so a missing property is reported
    // before any resolver is created.
    if (taxonIds.isEmpty()) {
        throw new BuildException("homologene.organisms property not set in project XML file");
    }
    setUpResolver();

    String previousGroup = null;
    Set<GeneRecord> genes = new HashSet<GeneRecord>();

    Iterator<String[]> lineIter = FormattedTextParser.parseTabDelimitedReader(reader);
    while (lineIter.hasNext()) {
        String[] bits = lineIter.next();
        if (bits.length < 6) {
            // incomplete row, ignore (it does not affect group tracking either)
            continue;
        }

        String groupId = bits[0];
        if (previousGroup != null && !groupId.equals(previousGroup)) {
            // group id changed: hand over the finished group and start a new one
            processHomologues(genes);
            genes = new HashSet<GeneRecord>();
        }
        previousGroup = groupId;

        String taxonId = bits[1];
        if (!isValid(taxonId)) {
            // not an organism of interest, skip
            continue;
        }

        String ncbiId = bits[2];
        String symbol = bits[3];
        String gene = getGene(ncbiId, symbol, taxonId);
        if (gene == null) {
            // invalid gene
            continue;
        }
        genes.add(new GeneRecord(gene, taxonId));
    }

    // The loop only flushes a group when the NEXT group starts, so the last
    // group in the file still has to be handed over here.
    if (previousGroup != null) {
        processHomologues(genes);
    }
}
/**
 * Creates the ID resolver the first time it is needed. The resolver is
 * requested for the union of the configured organism and homologue taxon
 * IDs; if a resolver already exists this method does nothing.
 */
private void setUpResolver() {
    if (rslv != null) {
        return;
    }
    Set<String> organismsToResolve = new HashSet<String>(taxonIds);
    organismsToResolve.addAll(homologues);
    rslv = IdResolverService.getIdResolverByOrganism(organismsToResolve);
}
private void readConfig() {
try {
props.load(getClass().getClassLoader().getResourceAsStream(
PROP_FILE));
} catch (IOException e) {
throw new RuntimeException("Problem loading properties '"
+ PROP_FILE + "'", e);
}
for (Map.Entry
© 2015 - 2025 Weber Informatics LLC | Privacy Policy