All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.cellbase.lib.EtlCommons Maven / Gradle / Ivy

There is a newer version: 6.3.0
Show newest version
/*
 * Copyright 2015-2020 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib;

import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.config.Configurator;
import org.opencb.commons.utils.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

/**
 * Shared constants (data names, folder names, file names) and small static helpers used by the
 * ETL code: running an external command, detecting "missing" placeholder values and counting the
 * lines of a file.
 *
 * Created by fjlopez on 03/06/16.
 */
public class EtlCommons {

    public static final String HOMO_SAPIENS_NAME = "Homo sapiens";

    public static final String GENOME_DATA = "genome";
    public static final String GENE_DATA = "gene";
    public static final String REFSEQ_DATA = "refseq";
    public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association";
    public static final String VARIATION_DATA = "variation";
    public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score";
    public static final String MISSENSE_VARIATION_SCORE_DATA = "missense_variation_functional_score";
    public static final String REGULATION_DATA = "regulation";
    public static final String PROTEIN_DATA = "protein";
    public static final String CONSERVATION_DATA = "conservation";
    public static final String CLINICAL_VARIANTS_DATA = "clinical_variants";
    public static final String SPLICE_SCORE_DATA = "splice_score";

    public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics";
    public static final String PHARMGKB_NAME = "PharmGKB";
    public static final String PHARMGKB_DATA = "pharmgkb";
    public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";

    public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
    public static final String CLINVAR_VERSION = "2024-05";
    public static final String CLINVAR_DATE = "2024-05";
    public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz";
    public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
    public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
    public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
    public static final String IARCTP53_FILE = "IARC-TP53.zip";
    public static final String GWAS_FILE = "gwas_catalog.tsv";
    public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz";
    // NOTE(review): marked deprecated, but this file does not name a replacement - confirm with callers.
    @Deprecated
    public static final String DBSNP_FILE = "GCF_000001405.40.gz";
    public static final String DBSNP_NAME = "dbSNP";
    public static final String DBSNP_VERSION_FILENAME = DBSNP_NAME + "Version.json";
    public static final String SNP_COLLECTION_NAME = "snp";

    public static final String STRUCTURAL_VARIANTS_DATA = "svs";
    public static final String REPEATS_DATA = "repeats";
    public static final String OBO_DATA = "ontology";
    public static final String HPO_FILE = "hp.obo";
    public static final String GO_FILE = "go-basic.obo";
    public static final String DOID_FILE = "doid.obo";
    public static final String PFM_DATA = "regulatory_pfm";

    // Build specific data options
    public static final String GENOME_INFO_DATA = "genome_info";
    public static final String DISGENET_DATA = "disgenet";
    public static final String HPO_DATA = "hpo";
    public static final String CADD_DATA = "cadd";
    public static final String PPI_DATA = "ppi";
    public static final String DRUG_DATA = "drug";
    public static final String CLINVAR_DATA = "clinvar";
    public static final String DOCM_DATA = "docm";
    public static final String COSMIC_DATA = "cosmic";
    public static final String GWAS_DATA = "gwas";
    public static final String IARCTP53_GERMLINE_FILE = "germlineMutationDataIARC TP53 Database, R20.txt";
    public static final String IARCTP53_GERMLINE_REFERENCES_FILE = "germlineMutationReferenceIARC TP53 Database, R20.txt";
    public static final String IARCTP53_SOMATIC_FILE = "somaticMutationDataIARC TP53 Database, R20.txt";
    public static final String IARCTP53_SOMATIC_REFERENCES_FILE = "somaticMutationReferenceIARC TP53 Database, R20.txt";
    public static final String HGMD_DATA = "hgmd";

    public static final String PUBMED_DATA = "pubmed";

    // Load specific data options
    public static final String PROTEIN_FUNCTIONAL_PREDICTION_DATA = "protein_functional_prediction";

    // Path and file names
    public static final String GERP_SUBDIRECTORY = "gerp";
    public static final String MMSPLICE_SUBDIRECTORY = "mmsplice";
    public static final String MMSPLICE_VERSION_FILENAME = "mmspliceVersion.json";
    public static final String SPLICEAI_SUBDIRECTORY = "spliceai";
    public static final String SPLICEAI_VERSION_FILENAME = "spliceaiVersion.json";

    // binary bigwig file
    public static final String GERP_FILE = "gerp_conservation_scores.homo_sapiens.GRCh38.bw";
    // bigwig file manually transformed to bedGraph file
    public static final String GERP_PROCESSED_FILE = "gerp.bedGraph.gz"; //"gerp_conservation_scores.homo_sapiens.GRCh38.bedGraph.gz";
    public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz";
    public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz";
    public static final String DOCM_FILE = "docm.json.gz";
    public static final String DOCM_NAME = "DOCM";
    public static final String STRUCTURAL_VARIANTS_FOLDER = "structuralVariants";
    public static final String DGV_FILE = "dgv.txt";
    public static final String DGV_VERSION_FILE = "dgvVersion.json";
    public static final String STRUCTURAL_VARIANTS_JSON = "structuralVariants";
    public static final String TRF_FILE = "simpleRepeat.txt.gz";
    public static final String TRF_VERSION_FILE = "simpleRepeat.json";
    public static final String GSD_FILE = "genomicSuperDups.txt.gz";
    public static final String GSD_VERSION_FILE = "genomicSuperDups.json";
    public static final String WM_FILE = "windowMasker.txt.gz";
    public static final String WM_VERSION_FILE = "windowMasker.json";
    public static final String REPEATS_FOLDER = "genome";
    public static final String REPEATS_JSON = "repeats";
    public static final String OBO_JSON = "ontology";
    public static final String HPO_VERSION_FILE = "hpoVersion.json";
    public static final String GO_VERSION_FILE = "goVersion.json";
    public static final String DO_VERSION_FILE = "doVersion.json";
    public static final String HGMD_FILE = "hgmd.vcf";
    public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json";

    public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz";
    public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz";

    /**
     * Runs an external command and waits for it to finish.
     * <p>
     * Side effect: the log4j root logger level is set to INFO on every call.
     * <p>
     * NOTE: when {@code logFilePath} is {@code null} the child's output is left on the default pipe and is
     * never read here; a child that writes a lot of output may block once the pipe buffer fills.
     *
     * @param workingDirectory directory in which the command is run; when {@code null} the
     *                         {@link ProcessBuilder} default is kept
     * @param binPath          program to execute (first element of the command line)
     * @param args             arguments appended after {@code binPath}; must not be {@code null}
     * @param logFilePath      file to which the merged standard output and standard error are appended;
     *                         when {@code null} the output is not redirected
     * @return {@code true} if the process exited with code 0, {@code false} otherwise
     * @throws IOException          if the process cannot be started
     * @throws InterruptedException if the current thread is interrupted while waiting for the process
     */
    public static boolean runCommandLineProcess(File workingDirectory, String binPath, List<String> args, String logFilePath)
            throws IOException, InterruptedException {
        // Set the root log level before the logger below is obtained.
        Configurator.setRootLevel(Level.INFO);

        Logger logger = LoggerFactory.getLogger("EtlCommons");

        ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath);

        logger.debug("Executing command: {}", StringUtils.join(builder.command(), " "));
        Process process = builder.start();

        // waitFor() blocks until termination and returns the exit code
        int exitValue = process.waitFor();
        if (exitValue != 0) {
            logger.warn("Error executing {}, error code: {}. More info in log file: {}", binPath, exitValue, logFilePath);
            return false;
        }
        return true;
    }

    /**
     * Builds the {@link ProcessBuilder} for {@code binPath} followed by {@code args}, with standard error
     * merged into standard output.
     *
     * @param workingDirectory working directory for the process, or {@code null} to keep the default
     * @param binPath          program to execute
     * @param args             arguments appended after {@code binPath}
     * @param logFilePath      file the output is appended to, or {@code null} for no redirection
     * @return the configured, not yet started, process builder
     */
    private static ProcessBuilder getProcessBuilder(File workingDirectory, String binPath, List<String> args,
                                                    String logFilePath) {
        List<String> commandArgs = new ArrayList<>();
        commandArgs.add(binPath);
        commandArgs.addAll(args);
        ProcessBuilder builder = new ProcessBuilder(commandArgs);

        // working directory and error and output log outputs
        if (workingDirectory != null) {
            builder.directory(workingDirectory);
        }
        builder.redirectErrorStream(true);
        if (logFilePath != null) {
            builder.redirectOutput(ProcessBuilder.Redirect.appendTo(new File(logFilePath)));
        }

        return builder;
    }

    /**
     * Tells whether a value should be treated as missing.
     * <p>
     * A value is missing when it is {@code null}, empty, or when nothing is left after removing every
     * occurrence of: {@code "not specified"}, spaces, {@code "NS"}, {@code "NA"}, {@code "na"},
     * {@code "NULL"}, {@code "null"}, tabs, dots and hyphens (removed in that order).
     *
     * @param string value to check, may be {@code null}
     * @return {@code true} if the value is missing, {@code false} otherwise
     */
    public static boolean isMissing(String string) {
        if (string == null || string.isEmpty()) {
            return true;
        }
        // "not specified" has to be stripped BEFORE spaces are removed: once the space inside the phrase
        // is gone the literal can never match and "not specified" would be reported as a present value.
        String remainder = string.replace("not specified", "")
                .replace(" ", "")
                .replace("NS", "")
                .replace("NA", "")
                .replace("na", "")
                .replace("NULL", "")
                .replace("null", "")
                .replace("\t", "")
                .replace(".", "")
                .replace("-", "");
        return remainder.isEmpty();
    }

    /**
     * Counts the lines of a file by reading it line by line.
     *
     * @param filePath file to read; opened through {@code FileUtils.newBufferedReader}
     * @return number of lines read
     * @throws IOException if the file cannot be opened or read
     */
    public static Long countFileLines(Path filePath) throws IOException {
        try (BufferedReader reader = FileUtils.newBufferedReader(filePath)) {
            long nLines = 0;
            while (reader.readLine() != null) {
                nLines++;
            }
            return nLines;
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy