// Source listing: org.opencb.cellbase.lib.download.GeneDownloadManager (Maven / Gradle / Ivy)
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.cellbase.lib.download;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.commons.utils.DockerUtils;
import org.opencb.commons.utils.FileUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
/**
 * Download manager for the gene data sources: Ensembl core files, RefSeq, MANE Select, LRG, DGIdb,
 * UniProt ID mapping, Gene Expression Atlas, HPO, DisGeNET, gnomAD constraints and GO annotation.
 *
 * <p>Several sources are only fetched when the configured species is "Homo sapiens". Fields such as
 * {@code logger}, {@code configuration}, {@code speciesConfiguration}, {@code downloadFolder} and
 * helpers such as {@code downloadFile} and {@code saveVersionData} are not declared in this class;
 * they are expected to come from {@link AbstractDownloadManager}.
 */
public class GeneDownloadManager extends AbstractDownloadManager {

    private static final String ENSEMBL_NAME = "ENSEMBL";
    private static final String UNIPROT_NAME = "UniProt";
    private static final String GENE_EXPRESSION_ATLAS_NAME = "Gene Expression Atlas";
    private static final String HPO_NAME = "HPO";
    private static final String DISGENET_NAME = "DisGeNET";
    private static final String GO_ANNOTATION_NAME = "EBI Gene Ontology Annotation";
    private static final String DGIDB_NAME = "DGIdb";
    private static final String GNOMAD_NAME = "gnomAD";

    /** Scientific name used to gate the human-only data sources. */
    private static final String HOMO_SAPIENS = "Homo sapiens";

    /** Maps a species scientific name to its UniProt "idmapping_selected" file name. */
    private static final Map<String, String> GENE_UNIPROT_XREF_FILES;

    static {
        Map<String, String> xrefFiles = new HashMap<>();
        xrefFiles.put(HOMO_SAPIENS, "HUMAN_9606_idmapping_selected.tab.gz");
        xrefFiles.put("Mus musculus", "MOUSE_10090_idmapping_selected.tab.gz");
        xrefFiles.put("Rattus norvegicus", "RAT_10116_idmapping_selected.tab.gz");
        xrefFiles.put("Danio rerio", "DANRE_7955_idmapping_selected.tab.gz");
        xrefFiles.put("Drosophila melanogaster", "DROME_7227_idmapping_selected.tab.gz");
        xrefFiles.put("Saccharomyces cerevisiae", "YEAST_559292_idmapping_selected.tab.gz");
        GENE_UNIPROT_XREF_FILES = Collections.unmodifiableMap(xrefFiles);
    }

    /**
     * Docker image used by {@link #runGeneExtraInfo(Path)}. Kept per instance (it used to be a static
     * field overwritten by every constructor call) so that managers built from different
     * configurations cannot clobber each other's value.
     */
    private final String dockerImage;

    /**
     * Creates the manager and derives the builder Docker image tag from the configured API version.
     *
     * @param species         species identifier, forwarded to the superclass
     * @param assembly        assembly identifier, forwarded to the superclass
     * @param targetDirectory target directory, forwarded to the superclass
     * @param configuration   CellBase configuration; its API version is used as the Docker image tag
     * @throws IOException       propagated from the superclass constructor
     * @throws CellBaseException propagated from the superclass constructor
     */
    public GeneDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
            throws IOException, CellBaseException {
        super(species, assembly, targetDirectory, configuration);
        this.dockerImage = "opencb/cellbase-builder:" + configuration.getApiVersion();
    }

    /**
     * Downloads every gene data source into the "gene" and "refseq" sub-folders of the download folder.
     *
     * <p>Sources that are skipped for the current species contribute nothing to the result; the
     * returned list never receives a {@code null} entry from a skipped source.
     *
     * @return the download descriptors of all files that were requested
     * @throws IOException          if a folder cannot be created or a download/version file fails
     * @throws InterruptedException if a download is interrupted
     */
    @Override
    public List<DownloadFile> download() throws IOException, InterruptedException {
        logger.info("Downloading gene information ...");

        Path geneFolder = downloadFolder.resolve("gene");
        Files.createDirectories(geneFolder);
        Path refseqFolder = downloadFolder.resolve("refseq");
        Files.createDirectories(refseqFolder);

        List<DownloadFile> downloadFiles = new ArrayList<>();
        downloadFiles.addAll(downloadEnsemblData(geneFolder));
        downloadFiles.addAll(downloadRefSeq(refseqFolder));
        addIfPresent(downloadFiles, downloadMane(geneFolder));
        addIfPresent(downloadFiles, downloadLrg(geneFolder));
        addIfPresent(downloadFiles, downloadDrugData(geneFolder));
        downloadFiles.addAll(downloadGeneUniprotXref(geneFolder));
        downloadFiles.add(downloadGeneExpressionAtlas(geneFolder));
        downloadFiles.addAll(downloadGeneDiseaseAnnotation(geneFolder));
        addIfPresent(downloadFiles, downloadGnomadConstraints(geneFolder));
        addIfPresent(downloadFiles, downloadGO(geneFolder));

        // The gene extra info step is currently disabled:
        // runGeneExtraInfo(geneFolder);

        return downloadFiles;
    }

    /** Appends {@code file} to {@code target} unless it is {@code null} (source skipped for this species). */
    private static void addIfPresent(List<DownloadFile> target, DownloadFile file) {
        if (file != null) {
            target.add(file);
        }
    }

    /** Returns whether the configured species is "Homo sapiens". */
    private boolean isHuman() {
        return HOMO_SAPIENS.equals(speciesConfiguration.getScientificName());
    }

    /**
     * Downloads the Ensembl GTF, protein FASTA and cDNA FASTA files and records the URLs used in
     * "ensemblCoreVersion.json".
     *
     * @param geneFolder folder receiving the downloaded files and the version file
     * @return the three download descriptors, in GTF, pep, cDNA order
     */
    private List<DownloadFile> downloadEnsemblData(Path geneFolder) throws IOException, InterruptedException {
        // NOTE(review): the message mentions "motifs" but no motif file is downloaded by this method.
        logger.info("Downloading gene Ensembl data (gtf, pep, cdna, motifs) ...");

        List<String> downloadedUrls = new ArrayList<>(3);
        List<DownloadFile> downloadFiles = new ArrayList<>(3);

        // Non-vertebrate species live under an extra path segment returned by getPhylo().
        String ensemblHost = ensemblHostUrl + "/" + ensemblRelease;
        if (!configuration.getSpecies().getVertebrates().contains(speciesConfiguration)) {
            ensemblHost = ensemblHost + "/" + getPhylo(speciesConfiguration);
        }

        String ensemblCollection = "";
        if (configuration.getSpecies().getBacteria().contains(speciesConfiguration)) {
            // WARN: assuming there's just one assembly
            ensemblCollection = speciesConfiguration.getAssemblies().get(0).getEnsemblCollection() + "/";
        }

        // Ensembl now leaves several GTF files in the FTP folder, so the URL is narrowed with the
        // release number: the text after the first '-' in ensemblRelease.
        String version = ensemblRelease.split("-")[1];

        String gtfUrl = ensemblHost + "/gtf/" + ensemblCollection + speciesShortName + "/*" + version + ".gtf.gz";
        downloadFiles.add(downloadFile(gtfUrl, geneFolder.resolve(speciesShortName + ".gtf.gz").toString()));
        downloadedUrls.add(gtfUrl);

        String fastaBaseUrl = ensemblHost + "/fasta/" + ensemblCollection + speciesShortName;

        String pepUrl = fastaBaseUrl + "/pep/*.pep.all.fa.gz";
        downloadFiles.add(downloadFile(pepUrl, geneFolder.resolve(speciesShortName + ".pep.all.fa.gz").toString()));
        downloadedUrls.add(pepUrl);

        String cdnaUrl = fastaBaseUrl + "/cdna/*.cdna.all.fa.gz";
        downloadFiles.add(downloadFile(cdnaUrl, geneFolder.resolve(speciesShortName + ".cdna.all.fa.gz").toString()));
        downloadedUrls.add(cdnaUrl);

        saveVersionData(EtlCommons.GENE_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(), downloadedUrls,
                geneFolder.resolve("ensemblCoreVersion.json"));

        return downloadFiles;
    }

    /**
     * Downloads the RefSeq GTF, genomic FASTA, protein FASTA and cDNA FASTA files (human only) and
     * writes one version file per download.
     *
     * @param refSeqFolder folder receiving the downloaded files and the version files
     * @return the download descriptors, or an empty list when the species is not human. An empty
     *         list (rather than {@code null}) is returned because the caller feeds the result
     *         straight into {@code List.addAll}.
     */
    private List<DownloadFile> downloadRefSeq(Path refSeqFolder) throws IOException, InterruptedException {
        if (!isHuman()) {
            return Collections.emptyList();
        }

        logger.info("Downloading RefSeq...");
        List<DownloadFile> downloadFiles = new ArrayList<>(4);
        String filePrefix = "refSeq_" + StringUtils.capitalize(speciesShortName) + "." + assemblyConfiguration.getName();

        // gtf
        String url = configuration.getDownload().getRefSeq().getHost();
        saveVersionData(EtlCommons.REFSEQ_DATA, "RefSeq", null, getTimeStamp(), Collections.singletonList(url),
                refSeqFolder.resolve("refSeqVersion.json"));
        String outputFileName = filePrefix + ".gtf.gz";
        logger.info("downloading " + url);
        Path outputPath = refSeqFolder.resolve(outputFileName);
        downloadFiles.add(downloadFile(url, outputPath.toString()));

        // genomic fasta
        url = configuration.getDownload().getRefSeqFasta().getHost();
        outputFileName = filePrefix + "_genomic.fna.gz";
        logger.info("downloading " + url);
        outputPath = refSeqFolder.resolve(outputFileName);
        saveVersionData(EtlCommons.REFSEQ_DATA, "RefSeq", null, getTimeStamp(),
                Collections.singletonList(url), refSeqFolder.resolve("refSeqFastaVersion.json"));
        downloadFiles.add(downloadFile(url, outputPath.toString()));
        // Only the genomic FASTA is decompressed here; the other RefSeq files stay gzipped.
        logger.info("Unzipping file: " + outputFileName);
        EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(outputPath.toString()), null);

        // protein fasta
        url = configuration.getDownload().getRefSeqProteinFasta().getHost();
        outputFileName = filePrefix + "_protein.faa.gz";
        outputPath = refSeqFolder.resolve(outputFileName);
        saveVersionData(EtlCommons.REFSEQ_DATA, "RefSeq", null, getTimeStamp(), Collections.singletonList(url),
                refSeqFolder.resolve("refSeqProteinFastaVersion.json"));
        downloadFiles.add(downloadFile(url, outputPath.toString()));

        // cDNA
        url = configuration.getDownload().getRefSeqCdna().getHost();
        outputFileName = filePrefix + "_cdna.fna.gz";
        outputPath = refSeqFolder.resolve(outputFileName);
        saveVersionData(EtlCommons.REFSEQ_DATA, "RefSeq", null, getTimeStamp(), Collections.singletonList(url),
                refSeqFolder.resolve("refSeqCdnaFastaVersion.json"));
        downloadFiles.add(downloadFile(url, outputPath.toString()));

        return downloadFiles;
    }

    /**
     * Downloads the MANE Select file (human only); the local file keeps the last path segment of the URL.
     *
     * @param geneFolder folder receiving the file and "maneSelectVersion.json"
     * @return the download descriptor, or {@code null} when the species is not human
     */
    private DownloadFile downloadMane(Path geneFolder) throws IOException, InterruptedException {
        if (!isHuman()) {
            return null;
        }
        logger.info("Downloading MANE Select ...");
        String url = configuration.getDownload().getManeSelect().getHost();
        saveVersionData(EtlCommons.GENE_DATA, "MANE Select", configuration.getDownload().getManeSelect().getVersion(),
                getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("maneSelectVersion.json"));
        String[] urlSegments = url.split("/");
        return downloadFile(url, geneFolder.resolve(urlSegments[urlSegments.length - 1]).toString());
    }

    /**
     * Downloads the LRG file (human only); the local file keeps the last path segment of the URL.
     *
     * @param geneFolder folder receiving the file and "lrgVersion.json"
     * @return the download descriptor, or {@code null} when the species is not human
     */
    private DownloadFile downloadLrg(Path geneFolder) throws IOException, InterruptedException {
        if (!isHuman()) {
            return null;
        }
        logger.info("Downloading LRG ...");
        String url = configuration.getDownload().getLrg().getHost();
        saveVersionData(EtlCommons.GENE_DATA, "LRG", configuration.getDownload().getLrg().getVersion(),
                getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("lrgVersion.json"));
        String[] urlSegments = url.split("/");
        return downloadFile(url, geneFolder.resolve(urlSegments[urlSegments.length - 1]).toString());
    }

    /**
     * Downloads the GO annotation file (human only) as "goa_human.gaf.gz".
     *
     * @param geneFolder folder receiving the file and "goAnnotationVersion.json"
     * @return the download descriptor, or {@code null} when the species is not human
     */
    private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException {
        if (!isHuman()) {
            return null;
        }
        logger.info("Downloading go annotation...");
        String url = configuration.getDownload().getGoAnnotation().getHost();
        saveVersionData(EtlCommons.GENE_DATA, GO_ANNOTATION_NAME, null, getTimeStamp(), Collections.singletonList(url),
                geneFolder.resolve("goAnnotationVersion.json"));
        return downloadFile(url, geneFolder.resolve("goa_human.gaf.gz").toString());
    }

    /**
     * Downloads the gnomAD constraints file (human only).
     *
     * @param geneFolder folder receiving the file and "gnomadVersion.json"
     * @return the download descriptor, or {@code null} when the species is not human
     */
    private DownloadFile downloadGnomadConstraints(Path geneFolder) throws IOException, InterruptedException {
        if (!isHuman()) {
            return null;
        }
        logger.info("Downloading gnomAD constraints data...");
        String url = configuration.getDownload().getGnomadConstraints().getHost();
        saveVersionData(EtlCommons.GENE_DATA, GNOMAD_NAME, configuration.getDownload().getGnomadConstraints().getVersion(),
                getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("gnomadVersion.json"));
        // NOTE(review): the local file name hard-codes "v2.1.1" regardless of the configured version.
        return downloadFile(url, geneFolder.resolve("gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz").toString());
    }

    /**
     * Downloads the DGIdb drug-gene file (human only) as "dgidb.tsv".
     *
     * @param geneFolder folder receiving the file and "dgidbVersion.json"
     * @return the download descriptor, or {@code null} when the species is not human
     */
    private DownloadFile downloadDrugData(Path geneFolder) throws IOException, InterruptedException {
        if (!isHuman()) {
            return null;
        }
        logger.info("Downloading drug-gene data...");
        String url = configuration.getDownload().getDgidb().getHost();
        saveVersionData(EtlCommons.GENE_DATA, DGIDB_NAME, null, getTimeStamp(), Collections.singletonList(url),
                geneFolder.resolve("dgidbVersion.json"));
        return downloadFile(url, geneFolder.resolve("dgidb.tsv").toString());
    }

    /**
     * Builds the URL of the UniProt "relnotes.txt" file, located three path levels above the
     * configured UniProt xref host.
     *
     * <p>The file name is resolved as part of the relative reference so the result contains a single
     * slash before "relnotes.txt" (appending "/relnotes.txt" to the already slash-terminated
     * resolved URI produced "//relnotes.txt").
     *
     * @return the release notes URL
     */
    private String getUniProtReleaseNotesUrl() {
        return URI.create(configuration.getDownload().getGeneUniprotXref().getHost())
                .resolve("../../../relnotes.txt")
                .toString();
    }

    /**
     * Reads the UniProt release from the first line of the release notes file: the third
     * space-separated token of that line.
     *
     * @param relnotesFilename path of the downloaded release notes file
     * @return the release token
     * @throws IOException if the file is missing, unreadable, empty, or its first line has fewer
     *                     than three space-separated tokens
     */
    private String getUniProtRelease(String relnotesFilename) throws IOException {
        Path path = Paths.get(relnotesFilename);
        FileUtils.checkFile(path);

        // The first line of relnotes.txt contains the UniProt release.
        String firstLine;
        try (BufferedReader reader = Files.newBufferedReader(path, Charset.defaultCharset())) {
            firstLine = reader.readLine();
        }
        if (firstLine == null) {
            throw new IOException("UniProt release notes file is empty: " + path);
        }
        String[] tokens = firstLine.split(" ");
        if (tokens.length < 3) {
            throw new IOException("Unexpected first line in UniProt release notes file " + path + ": '" + firstLine + "'");
        }
        return tokens[2];
    }

    /**
     * Downloads the UniProt ID mapping file and the release notes for species listed in
     * {@link #GENE_UNIPROT_XREF_FILES}, then writes "uniprotXrefVersion.json" using the release read
     * from the downloaded release notes.
     *
     * @param geneFolder folder receiving the files and the version file
     * @return the download descriptors, or an empty list when the species has no mapping file
     */
    private List<DownloadFile> downloadGeneUniprotXref(Path geneFolder) throws IOException, InterruptedException {
        logger.info("Downloading UniProt ID mapping ...");

        List<DownloadFile> downloadFiles = new ArrayList<>();
        String xrefFileName = GENE_UNIPROT_XREF_FILES.get(speciesConfiguration.getScientificName());
        if (xrefFileName != null) {
            String xrefUrl = configuration.getDownload().getGeneUniprotXref().getHost() + "/" + xrefFileName;
            Path relnotesPath = geneFolder.resolve("uniprotRelnotes.txt");

            downloadFiles.add(downloadFile(xrefUrl, geneFolder.resolve("idmapping_selected.tab.gz").toString()));
            downloadFiles.add(downloadFile(getUniProtReleaseNotesUrl(), relnotesPath.toString()));

            // The release notes must be on disk before the version can be extracted from them.
            saveVersionData(EtlCommons.GENE_DATA, UNIPROT_NAME, getUniProtRelease(relnotesPath.toString()), getTimeStamp(),
                    Collections.singletonList(xrefUrl), geneFolder.resolve("uniprotXrefVersion.json"));
        }
        return downloadFiles;
    }

    /**
     * Downloads the Gene Expression Atlas file as "allgenes_updown_in_organism_part.tab.gz"; this
     * source is not restricted to any species.
     *
     * @param geneFolder folder receiving the file and "geneExpressionAtlasVersion.json"
     * @return the download descriptor
     */
    private DownloadFile downloadGeneExpressionAtlas(Path geneFolder) throws IOException, InterruptedException {
        logger.info("Downloading gene expression atlas ...");
        String url = configuration.getDownload().getGeneExpressionAtlas().getHost();
        saveVersionData(EtlCommons.GENE_DATA, GENE_EXPRESSION_ATLAS_NAME, getGeneExpressionAtlasVersion(), getTimeStamp(),
                Collections.singletonList(url), geneFolder.resolve("geneExpressionAtlasVersion.json"));
        return downloadFile(url, geneFolder.resolve("allgenes_updown_in_organism_part.tab.gz").toString());
    }

    /**
     * Extracts the Gene Expression Atlas version from the configured URL: the sixth
     * underscore-separated token of the file base name, with ".tab" removed.
     *
     * @return the version token
     * @throws ArrayIndexOutOfBoundsException if the base name has fewer than six '_'-separated tokens
     */
    private String getGeneExpressionAtlasVersion() {
        String baseName = FilenameUtils.getBaseName(configuration.getDownload().getGeneExpressionAtlas().getHost());
        return baseName.split("_")[5].replace(".tab", "");
    }

    /**
     * Downloads the HPO file and the configured DisGeNET files, and writes "hpoVersion.json" and
     * "disgenetVersion.json". A DisGeNET file named "readme.txt" (case-insensitive) is stored as
     * "disgenetReadme.txt"; the DisGeNET version is then read from that file.
     *
     * @param geneFolder folder receiving the files and the version files
     * @return the download descriptors, HPO first and then the DisGeNET files in configured order
     */
    private List<DownloadFile> downloadGeneDiseaseAnnotation(Path geneFolder) throws IOException, InterruptedException {
        logger.info("Downloading gene disease annotation ...");

        List<DownloadFile> downloadFiles = new ArrayList<>();

        // HPO: the local file keeps the name found after the last '/' of the URL.
        String host = configuration.getDownload().getHpo().getHost();
        String fileName = StringUtils.substringAfterLast(host, "/");
        downloadFiles.add(downloadFile(host, geneFolder.resolve(fileName).toString()));
        saveVersionData(EtlCommons.GENE_DATA, HPO_NAME, null, getTimeStamp(), Collections.singletonList(host),
                geneFolder.resolve("hpoVersion.json"));

        // DisGeNET
        host = configuration.getDownload().getDisgenet().getHost();
        List<String> files = configuration.getDownload().getDisgenet().getFiles();
        for (String file : files) {
            String outputFile = file.equalsIgnoreCase("readme.txt") ? "disgenetReadme.txt" : file;
            downloadFiles.add(downloadFile(host + "/" + file, geneFolder.resolve(outputFile).toString()));
        }
        // NOTE(review): this relies on the configured file list containing the readme - confirm.
        saveVersionData(EtlCommons.GENE_DISEASE_ASSOCIATION_DATA, DISGENET_NAME,
                getVersionFromVersionLine(geneFolder.resolve("disgenetReadme.txt"), "(version"), getTimeStamp(),
                Collections.singletonList(host), geneFolder.resolve("disgenetVersion.json"));

        return downloadFiles;
    }

    /**
     * Runs the "gene_extra_info.pl" script, either directly when the environment variable
     * CELLBASE_BUILD_DOCKER is "true", or through the builder Docker image otherwise, binding the
     * gene folder to "/ensembl-data". Currently not invoked from {@link #download()}.
     *
     * @param geneFolder folder that receives the script output
     */
    private void runGeneExtraInfo(Path geneFolder) throws IOException, InterruptedException {
        // TODO skip if we already have these data
        logger.info("Downloading gene extra info ...");

        String geneFolderPath = geneFolder.toAbsolutePath().toString();
        if ("true".equals(System.getenv("CELLBASE_BUILD_DOCKER"))) {
            final String outputLog = downloadLogFolder + "/gene_extra_info.log";
            EtlCommons.runCommandLineProcess(null, "/opt/cellbase/gene_extra_info.pl",
                    Arrays.asList("--outdir", geneFolderPath),
                    outputLog);
        } else {
            AbstractMap.SimpleEntry<String, String> outputBinding =
                    new AbstractMap.SimpleEntry<>(geneFolderPath, "/ensembl-data");
            String ensemblScriptParams = "/opt/cellbase/gene_extra_info.pl --outdir /ensembl-data";
            DockerUtils.run(dockerImage, null, outputBinding, ensemblScriptParams, null);
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy