// All Downloads are FREE. Search and download functionality uses the official Maven repository.
//
// org.opencb.cellbase.lib.download.ProteinDownloadManager Maven / Gradle / Ivy
//
// The newest version!
/*
 * Copyright 2015-2020 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib.download;

import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.commons.utils.FileUtils;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Download manager for the protein data set. At present only UniProt (Swiss-Prot XML plus its
 * release notes) is fetched; the IntAct and InterPro downloads are disabled (see the
 * commented-out code at the end of {@link #download()}).
 */
public class ProteinDownloadManager extends AbstractDownloadManager {

    private static final String UNIPROT_NAME = "UniProt";

    /** Number of UniProt entries written to each chunk file by {@link #splitUniprot(Path, Path)}. */
    private static final int UNIPROT_ENTRIES_PER_CHUNK = 10000;

    /**
     * Creates a protein download manager; all arguments are passed straight to the parent constructor.
     *
     * @param species species to download data for
     * @param assembly assembly of the species
     * @param targetDirectory directory handed to the parent manager
     * @param configuration CellBase configuration holding the download URLs
     * @throws IOException propagated from the parent constructor
     * @throws CellBaseException propagated from the parent constructor
     */
    public ProteinDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
            throws IOException, CellBaseException {
        super(species, assembly, targetDirectory, configuration);
    }

    /**
     * Downloads the UniProt XML file and its release notes into the {@code protein} sub-folder of
     * the download folder, splits the XML into chunk files under {@code uniprot_chunks}, and
     * records the version metadata in {@code uniprotVersion.json}.
     *
     * @return list of files downloaded, or {@code null} when the species has no protein data to download
     * @throws IOException if there is an error writing to a file
     * @throws InterruptedException if there is an error downloading files
     */
    // NOTE(review): the raw List is kept so callers are unaffected; the element type is whatever
    // downloadFile(...) returns, which is not visible from this file.
    public List download() throws IOException, InterruptedException {
        if (!speciesHasInfoToDownload(speciesConfiguration, "protein")) {
            // Kept as null (rather than an empty list) because callers may rely on it.
            return null;
        }
        logger.info("Downloading protein information ...");
        Path proteinFolder = downloadFolder.resolve("protein");
        Files.createDirectories(proteinFolder);
        List downloadFiles = new ArrayList<>();

        // UniProt XML, split into smaller chunk files right after the download.
        Path uniprotFile = proteinFolder.resolve("uniprot_sprot.xml.gz");
        Path uniprotChunksFolder = proteinFolder.resolve("uniprot_chunks");
        String url = configuration.getDownload().getUniprot().getHost();
        downloadFiles.add(downloadFile(url, uniprotFile.toString()));
        Files.createDirectories(uniprotChunksFolder);
        splitUniprot(uniprotFile, uniprotChunksFolder);

        // Release notes: line 1 (as returned by getLine) is stored as the version string.
        Path relNotesFile = proteinFolder.resolve("uniprotRelnotes.txt");
        String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost();
        downloadFiles.add(downloadFile(relNotesUrl, relNotesFile.toString()));

        saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(relNotesFile, 1),
                getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("uniprotVersion.json"));

        return downloadFiles;

        // Disabled IntAct and InterPro downloads, kept for reference:
//        url = configuration.getDownload().getIntact().getHost();
//        downloadFile(url, proteinFolder.resolve("intact.txt").toString());
//        saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, null, getTimeStamp(), Collections.singletonList(url),
//                proteinFolder.resolve("intactVersion.json"));
//
//        url = configuration.getDownload().getInterpro().getHost();
//        downloadFile(url, proteinFolder.resolve("protein2ipr.dat.gz").toString());
//        relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost();
//        downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString());
//        saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5),
//                getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("interproVersion.json"));
    }

    /**
     * Splits the UniProt XML file into chunk files of at most
     * {@link #UNIPROT_ENTRIES_PER_CHUNK} entries each. Every chunk starts with a copy of the
     * lines that precede the first entry of the input and ends with the root closing tag, so
     * that each chunk is a complete document on its own.
     *
     * <p>NOTE(review): the XML literals ({@code <entry}, {@code </entry>}, {@code </uniprot>})
     * and the chunk file name ({@code chunk_N.xml}) were missing from the reviewed copy of this
     * method and have been reconstructed from the UniProt XML layout and the otherwise unused
     * locals and imports. Confirm them against the consumer of {@code uniprot_chunks}.
     *
     * @param uniprotFilePath UniProt XML file to split (opened through {@code FileUtils.newBufferedReader})
     * @param splitOutdirPath existing directory where the chunk files are written
     * @throws IOException if the input cannot be read or a chunk file cannot be created
     */
    private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException {
        StringBuilder header = new StringBuilder();
        PrintWriter pw = null;
        boolean beforeEntry = true;
        boolean inEntry = false;
        int count = 0;
        int chunk = 0;
        try (BufferedReader br = FileUtils.newBufferedReader(uniprotFilePath)) {
            String line;
            while ((line = br.readLine()) != null) {
                String trimmedLine = line.trim();

                if (trimmedLine.startsWith("<entry")) {
                    inEntry = true;
                    beforeEntry = false;
                    // No chunk is open: this entry is the first one of a new chunk file.
                    if (pw == null) {
                        pw = new PrintWriter(new FileOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile()));
                        pw.println(header.toString().trim());
                    }
                    count++;
                }

                // Everything before the first entry is the header replicated in each chunk.
                if (beforeEntry) {
                    header.append(line).append("\n");
                }

                if (inEntry) {
                    pw.println(line);
                }

                if (trimmedLine.startsWith("</entry>")) {
                    inEntry = false;
                    // Chunk is full: terminate the document and close it.
                    if (count % UNIPROT_ENTRIES_PER_CHUNK == 0) {
                        pw.print("</uniprot>");
                        pw.close();
                        pw = null;
                        chunk++;
                    }
                }
            }

            // Terminate the last, partially filled chunk (if any).
            if (pw != null) {
                pw.print("</uniprot>");
            }
        } finally {
            if (pw != null) {
                pw.close();
            }
        }
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy