All Downloads are FREE. Search and download functionalities are using the official Maven repository.

pro.parseq.vcf.utils.VcfGrammar Maven / Gradle / Ivy

There is a newer version: 1.1.1-RELEASE
Show newest version
/*******************************************************************************
 *     Copyright 2016-2017 the original author or authors.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  
 *******************************************************************************/
package pro.parseq.vcf.utils;

import java.util.regex.Pattern;

/**
 * Holds the lexical building blocks of the VCF text format: delimiters, column
 * indices and pre-compiled regular expressions for meta-information, header and
 * data lines (see the VCFv4.2 specification for more details).
 *
 * <p>All members are constants; the patterns are compiled once when the class is
 * loaded. Unless stated otherwise, the "extraction" patterns (those built around
 * look-behind / look-ahead assertions) are meant to be used with
 * {@code Matcher.find()} so that {@code Matcher.group()} yields the bare value.
 * 
 * @author Alexander Afanasyev [email protected]
 */
public class VcfGrammar {

	// Punctuation (regular expression fragments)
	private static final String COMMA_DELIMITER = ",";
	private static final String COLON_DELIMITER = ":";
	private static final String EQUAL_DELIMITER = "=";
	private static final String PIPE_DELIMITER = "\\|";
	private static final String SEMICOLON_DELIMITER = ";";
	// NOTE: despite its name this fragment accepts both "/" and "|"
	private static final String SLASH_DELIMITER = "\\/|\\|";
	private static final String TAB_DELIMITER = "\\t";

	// Meta-information line syntax

	// The dot between major and minor version is escaped so that only a literal '.' is accepted
	private static final String FILEFORMAT_PATTERN = "##fileformat=VCFv[\\d]+\\.[\\d]+";
	/** Matches the {@code ##fileformat=VCFv<major>.<minor>} line. */
	public static final Pattern fileformatPattern = Pattern.compile(FILEFORMAT_PATTERN);

	private static final String METADATA_PATTERN = "##[\\w\\.]+=[^\\n]+";
	// Accepts the same key alphabet (word characters and dots) as METADATA_PATTERN,
	// otherwise a line with a dotted key would match as metadata but yield no key
	private static final String METADATA_KEY_PATTERN = "(?<=##)[\\w\\.]+(?==)";
	private static final String METADATA_VALUE_PATTERN = "(?<==)[^\\n]+";
	/** Matches a generic {@code ##key=value} meta-information line. */
	public static final Pattern metadataPattern = Pattern.compile(METADATA_PATTERN);
	/** Extracts the key located between the leading {@code ##} and the first {@code =}. */
	public static final Pattern metadataKeyPattern = Pattern.compile(METADATA_KEY_PATTERN);
	/** Extracts everything following the first {@code =} up to the end of the line. */
	public static final Pattern metadataValuePattern = Pattern.compile(METADATA_VALUE_PATTERN);

	private static final String FILTER_PATTERN = "##FILTER=";
	private static final String FILTER_ID_PATTERN = "(?<=ID=)[\\w\\.]+(?=,)";
	private static final String FILTER_DESCRIPTION_PATTERN = "(?<=\")[^\\n]+(?=\")";
	/** Finds the {@code ##FILTER=} prefix of a filter declaration. */
	public static final Pattern filterPattern = Pattern.compile(FILTER_PATTERN);
	/** Extracts the {@code ID} value (terminated by a comma) of a filter declaration. */
	public static final Pattern filterIdPattern = Pattern.compile(FILTER_ID_PATTERN);
	/** Extracts the text between the first and the last double quote of the line. */
	public static final Pattern filterDescriptionPattern = Pattern.compile(FILTER_DESCRIPTION_PATTERN);

	private static final String FORMAT_PATTERN = "##FORMAT=";
	private static final String FORMAT_ID_PATTERN = "(?<=ID=)[\\w\\.]+(?=,)";
	// Either one of the special markers ('.', 'A', 'R', 'G') or an integer of any length
	private static final String FORMAT_NUMBER_PATTERN = "(?<=Number=)(?:[\\.ARG]|[\\d]+)(?=,)";
	private static final String FORMAT_TYPE_PATTERN = "(?<=Type=)(Integer|Float|Character|String)(?=,)";
	private static final String FORMAT_DESCRIPTION_PATTERN = "(?<=\")[^\\n]+(?=\")";
	/** Finds the {@code ##FORMAT=} prefix of a format declaration. */
	public static final Pattern formatPattern = Pattern.compile(FORMAT_PATTERN);
	/** Extracts the {@code ID} value (terminated by a comma) of a format declaration. */
	public static final Pattern formatIdPattern = Pattern.compile(FORMAT_ID_PATTERN);
	/** Extracts the {@code Number} value: {@code .}, {@code A}, {@code R}, {@code G} or an integer. */
	public static final Pattern formatNumberPattern = Pattern.compile(FORMAT_NUMBER_PATTERN);
	/** Extracts the {@code Type} value; group 1 holds the same text as the whole match. */
	public static final Pattern formatTypePattern = Pattern.compile(FORMAT_TYPE_PATTERN);
	/** Extracts the text between the first and the last double quote of the line. */
	public static final Pattern formatDescriptionPattern = Pattern.compile(FORMAT_DESCRIPTION_PATTERN);

	// Genotype values: allele indices may consist of several digits (e.g. "10/11"),
	// a dot stands for an unknown allele; group 1 is always the separator
	private static final String GENOTYPE_PATTERN = "(?:[\\d]+|[\\.])(\\/|\\|)(?:[\\d]+|[\\.])";
	private static final String GENOTYPE_FIRST_ALLELE_PATTERN = "[\\d]+(?=(\\/|\\|))";
	private static final String GENOTYPE_UNKNOWN_FIRST_ALLELE_PATTERN = "[\\.](?=(\\/|\\|))";
	private static final String GENOTYPE_SECOND_ALLELE_PATTERN = "(?<=(\\/|\\|))[\\d]+";
	private static final String GENOTYPE_UNKNOWN_SECOND_ALLELE_PATTERN = "(?<=(\\/|\\|))[\\.]";
	/** Matches a two-allele genotype such as {@code 0/1}, {@code 10|11} or {@code ./.}. */
	public static final Pattern genotypeValuePattern = Pattern.compile(GENOTYPE_PATTERN);
	/** Extracts the numeric allele index standing right before the separator. */
	public static final Pattern genotypeFirstAllelePattern = Pattern.compile(GENOTYPE_FIRST_ALLELE_PATTERN);
	/** Extracts the numeric allele index standing right after the separator. */
	public static final Pattern genotypeSecondAllelePattern = Pattern.compile(GENOTYPE_SECOND_ALLELE_PATTERN);
	/** Finds a {@code .} (unknown allele) standing right before the separator. */
	public static final Pattern genotypeUnknownFirstAllelePattern = Pattern.compile(GENOTYPE_UNKNOWN_FIRST_ALLELE_PATTERN);
	/** Finds a {@code .} (unknown allele) standing right after the separator. */
	public static final Pattern genotypeUnknownSecondAllelePattern = Pattern.compile(GENOTYPE_UNKNOWN_SECOND_ALLELE_PATTERN);

	/** Key of the genotype field. */
	public static final String GENOTYPE_FIELD = "GT";
	/**
	 * Regular expression splitting a genotype into alleles on {@code /} or {@code |}.
	 * SLASH_DELIMITER already covers both separators, so the appended pipe alternative
	 * is redundant; the value is kept unchanged for compatibility.
	 */
	public static final String GENOTYPE_DELIMITER = String.format("%s|%s", SLASH_DELIMITER, PIPE_DELIMITER);
	/** Allele index denoting the reference allele. */
	public static final String REFERENCE_ALLELE = "0";

	// Special values of the "Number" attribute
	public static final String VALUE_PER_ALLELE = "A";
	public static final String VALUE_PER_ALLELE_WITH_REF = "R";
	public static final String VALUE_PER_GENOTYPE = "G";
	public static final String UNBOUNDED_VALUE = ".";

	private static final String INFO_PATTERN = "##INFO=";
	private static final String INFO_ID_PATTERN = "(?<=ID=)[\\w\\.]+(?=,)";
	// Either one of the special markers ('.', 'A', 'R', 'G') or an integer of any length
	private static final String INFO_NUMBER_PATTERN = "(?<=Number=)(?:[\\.ARG]|[\\d]+)(?=,)";
	private static final String INFO_TYPE_PATTERN = "(?<=Type=)(Flag|Integer|Float|Character|String)(?=,)";
	// The description ends at the first quote that is followed either by a Source/Version
	// attribute or by no further quotes on the line. A plain greedy match up to the last
	// quote would swallow the trailing Source="..." and Version="..." attributes.
	private static final String INFO_DESCRIPTION_PATTERN =
			"(?<=Description=\")[^\\n]+?(?=\"(?:,\\s*(?:Source|Version)=|[^\"\\n]*$))";
	private static final String INFO_SOURCE_PATTERN = "(?<=Source=\")[^\\n\"]+(?=\")";
	private static final String INFO_VERSION_PATTERN = "(?<=Version=\")[^\\n]+(?=\")";
	/** Finds the {@code ##INFO=} prefix of an info declaration. */
	public static final Pattern infoPattern = Pattern.compile(INFO_PATTERN);
	/** Extracts the {@code ID} value (terminated by a comma) of an info declaration. */
	public static final Pattern infoIdPattern = Pattern.compile(INFO_ID_PATTERN);
	/** Extracts the {@code Number} value: {@code .}, {@code A}, {@code R}, {@code G} or an integer. */
	public static final Pattern infoNumberPattern = Pattern.compile(INFO_NUMBER_PATTERN);
	/** Extracts the {@code Type} value; group 1 holds the same text as the whole match. */
	public static final Pattern infoTypePattern = Pattern.compile(INFO_TYPE_PATTERN);
	/** Extracts the quoted {@code Description} text without trailing attributes. */
	public static final Pattern infoDescriptionPattern = Pattern.compile(INFO_DESCRIPTION_PATTERN);
	/** Extracts the quoted {@code Source} text. */
	public static final Pattern infoSourcePattern = Pattern.compile(INFO_SOURCE_PATTERN);
	/** Extracts the quoted {@code Version} text (up to the last quote of the line). */
	public static final Pattern infoVersionPattern = Pattern.compile(INFO_VERSION_PATTERN);

	// A "Number" attribute holding an exact count: an integer of any length
	private static final String EXACT_COUNT_NUMBER_VALUE_PATTERN = "[\\d]+";
	/** Matches a {@code Number} value that is an explicit non-negative integer. */
	public static final Pattern exactCountNumberValuePattern = Pattern.compile(EXACT_COUNT_NUMBER_VALUE_PATTERN);

	// Header line syntax: eight mandatory columns, optionally followed by FORMAT and sample names
	private static final String HEADER_PATTERN = "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO(\\tFORMAT(\\t[\\S]+)*)?";
	/** Matches the tab-delimited {@code #CHROM ...} header line. */
	public static final Pattern headerPattern = Pattern.compile(HEADER_PATTERN);

	/** Number of columns every data line must contain (CHROM through INFO). */
	public static final int MANDATORY_COLUMNS_NUMBER = 8;
	/** Regular expression separating the columns of header and data lines. */
	public static final String COLUMN_DELIMITER = TAB_DELIMITER;

	// Zero-based column positions within a tab-split header or data line
	public static final int CHROM_COLUMN_IDX = 0;
	public static final int POS_COLUMN_IDX = 1;
	public static final int ID_COLUMN_IDX = 2;
	public static final int REF_COLUMN_IDX = 3;
	public static final int ALT_COLUMN_IDX = 4;
	public static final int QUAL_COLUMN_IDX = 5;
	public static final int FILTER_COLUMN_IDX = 6;
	public static final int INFO_COLUMN_IDX = 7;
	public static final int FORMAT_COLUMN_IDX = 8;

	// Data line syntax
	// TODO: make the pattern more detailed. As written it only accepts upper-case
	// A/T/G/C/N bases in REF, the same bases plus ',' and '.' in ALT, and digits
	// and dots in QUAL.
	private static final String DATA_LINE_PATTERN = "[\\S]+\\t[\\d]+\\t[\\S]+\\t[ATGCN]+\\t[ATGCN,\\.]+\\t[\\d\\.]+\\t[\\S]+\\t[\\S]+(\\t[\\S]+(\\t[\\S]+)*)?";
	/** Matches a tab-delimited data line (see the restrictions noted above the pattern string). */
	public static final Pattern dataLinePattern = Pattern.compile(DATA_LINE_PATTERN);

	/** FILTER column value of a record that passed all filters. */
	public static final String FILTER_PASSED = "PASS";
	/** Placeholder used for a missing value. */
	public static final String MISSING_VALUE = ".";

	// Delimiters used inside individual columns
	public static final String ALLELE_DELIMITER = COMMA_DELIMITER;
	public static final String FILTER_DELIMITER = SEMICOLON_DELIMITER;
	public static final String FORMAT_DELIMITER = COLON_DELIMITER;
	public static final String ID_DELIMITER = SEMICOLON_DELIMITER;
	public static final String INFO_DELIMITER = SEMICOLON_DELIMITER;
	public static final String INFO_KEY_VALUE_DELIMITER = EQUAL_DELIMITER;
	public static final String INFO_VALUE_DELIMITER = COMMA_DELIMITER;
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy