All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder Maven / Gradle / Ivy

There is a newer version: 6.3.0
Show newest version
/*
 * Copyright 2015-2020 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib.builders;

import org.opencb.biodata.models.core.GenomeSequenceChunk;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.commons.utils.FileUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Path;

public class GenomeSequenceFastaBuilder extends CellBaseBuilder {

    private Path genomeReferenceFastaFile;

    private static final int CHUNK_SIZE = 2000;

    public GenomeSequenceFastaBuilder(Path genomeReferenceFastaFile, CellBaseSerializer serializer) {
        super(serializer);
        this.genomeReferenceFastaFile = genomeReferenceFastaFile;
    }

    @Override
    public void parse() {

        try {
            String sequenceName = null;
            String sequenceType = "";
            String sequenceAssembly = null;
            String line;
            StringBuilder sequenceStringBuilder = new StringBuilder();

            // Preparing input and output files
            BufferedReader br;
            br = FileUtils.newBufferedReader(genomeReferenceFastaFile);

            while ((line = br.readLine()) != null) {

                if (!line.startsWith(">")) {
                    sequenceStringBuilder.append(line);
                } else {
                    // new chromosome, save data
                    if (sequenceStringBuilder.length() > 0) {
                        if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) {
                            System.out.println(sequenceName);
                            serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString());
                        }
                    }

                    // initialize data structures
                    String[] lineParts = line.replace(">", "").split(" ");
                    sequenceName = lineParts[0];
                    // Non ENSEMBL fasta files may not contain this extra info in the sequence header, e.g. the Ebola
                    // virus file: >KM034562v1
                    if (lineParts.length > 1) {
                        sequenceType = lineParts[2].split(":")[0];
                        sequenceAssembly = lineParts[2].split(":")[1];
                    }
                    sequenceStringBuilder.delete(0, sequenceStringBuilder.length());
                }
            }
            // Last chromosome must be processed
            if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) {
                serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString());
            }

            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence)
            throws IOException {
        int chunk = 0;
        int start = 1;
        int end = CHUNK_SIZE - 1;
        String chunkSequence;

        String chunkIdSuffix = CHUNK_SIZE / 1000 + "k";
        GenomeSequenceChunk genomeSequenceChunk;

        if (sequence.length() < CHUNK_SIZE) { //chromosome sequence length can be less than CHUNK_SIZE
            chunkSequence = sequence;
            genomeSequenceChunk = new GenomeSequenceChunk(chromosome, chromosome + "_" + 0 + "_" + chunkIdSuffix, start,
                    sequence.length() - 1, sequenceType, sequenceAssembly, chunkSequence);
            serializer.serialize(genomeSequenceChunk);
            start += CHUNK_SIZE - 1;
        } else {
            while (start < sequence.length()) {
                if (chunk % 10000 == 0) {
                    System.out.println("Chr:" + chromosome + " chunkId:" + chunk);
                }
                // First chunk of the chromosome
                if (start == 1) {
                    // First chunk contains CHUNK_SIZE-1 nucleotides as index start at position 1 but must end at 1999
                    chunkSequence = sequence.substring(start - 1, CHUNK_SIZE - 1);
                    genomeSequenceChunk = new GenomeSequenceChunk(chromosome, chromosome + "_" + chunk + "_" + chunkIdSuffix, start,
                            end, sequenceType, sequenceAssembly, chunkSequence);
                    serializer.serialize(genomeSequenceChunk);
                    start += CHUNK_SIZE - 1;

                } else {
                    // Regular chunk
                    if ((start + CHUNK_SIZE) < sequence.length()) {
                        chunkSequence = sequence.substring(start - 1, start + CHUNK_SIZE - 1);
                        genomeSequenceChunk = new GenomeSequenceChunk(chromosome, chromosome + "_" + chunk + "_" + chunkIdSuffix, start,
                                end, sequenceType, sequenceAssembly, chunkSequence);
                        serializer.serialize(genomeSequenceChunk);
                        start += CHUNK_SIZE;
                    } else {
                        // Last chunk of the chromosome
                        chunkSequence = sequence.substring(start - 1, sequence.length());
                        genomeSequenceChunk = new GenomeSequenceChunk(chromosome, chromosome + "_" + chunk + "_" + chunkIdSuffix, start,
                                sequence.length(), sequenceType, sequenceAssembly, chunkSequence);
                        serializer.serialize(genomeSequenceChunk);
                        start = sequence.length();
                    }
                }
                end = start + CHUNK_SIZE - 1;
                chunk++;
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy