pro.parseq.vcf.utils.VcfGrammar Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of vcf-explorer Show documentation
Show all versions of vcf-explorer Show documentation
Library for Variant Call Format (VCF) files manipulation
/*******************************************************************************
* Copyright 2016-2017 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*******************************************************************************/
package pro.parseq.vcf.utils;
import java.util.regex.Pattern;
/**
 * Class to hold VCF specification properties (see VCFv4.2 specification for more details).
 *
 * <p>Groups the delimiters, column indices and pre-compiled regular expressions used to
 * recognise meta-information lines, the header line and data lines. Patterns that are
 * wrapped in look-behind / look-ahead match only the value itself, so they locate a value
 * inside a line through {@link java.util.regex.Matcher#find()}.
 *
 * @author Alexander Afanasyev [email protected]
 */
public class VcfGrammar {

    // Punctuation (regular-expression fragments)
    private static final String COMMA_DELIMITER = ",";
    private static final String COLON_DELIMITER = ":";
    private static final String EQUAL_DELIMITER = "=";
    private static final String PIPE_DELIMITER = "\\|";
    private static final String SEMICOLON_DELIMITER = ";";
    // Slash only: the pipe alternative is contributed by PIPE_DELIMITER where both are needed.
    private static final String SLASH_DELIMITER = "\\/";
    private static final String TAB_DELIMITER = "\\t";

    // Meta-information line syntax

    // The dot between major and minor version is escaped so that it matches a literal '.' only.
    private static final String FILEFORMAT_PATTERN = "##fileformat=VCFv[\\d]+\\.[\\d]+";
    /** Matches the {@code ##fileformat=VCFv<major>.<minor>} line. */
    public static final Pattern fileformatPattern = Pattern.compile(FILEFORMAT_PATTERN);

    private static final String METADATA_PATTERN = "##[\\w]+=[^\\n]+";
    private static final String METADATA_KEY_PATTERN = "(?<=##)[\\w]+(?==)";
    private static final String METADATA_VALUE_PATTERN = "(?<==)[^\\n]+";
    /** Matches a generic {@code ##key=value} meta-information line. */
    public static final Pattern metadataPattern = Pattern.compile(METADATA_PATTERN);
    /** Finds the key between {@code ##} and {@code =}. */
    public static final Pattern metadataKeyPattern = Pattern.compile(METADATA_KEY_PATTERN);
    /** Finds everything after the first {@code =} up to the end of the line. */
    public static final Pattern metadataValuePattern = Pattern.compile(METADATA_VALUE_PATTERN);

    private static final String FILTER_PATTERN = "##FILTER=";
    private static final String FILTER_ID_PATTERN = "(?<=ID=)[\\w]+(?=,)";
    private static final String FILTER_DESCRIPTION_PATTERN = "(?<=\")[^\\n]+(?=\")";
    /** Finds the {@code ##FILTER=} prefix. */
    public static final Pattern filterPattern = Pattern.compile(FILTER_PATTERN);
    /** Finds the value of {@code ID=} (terminated by a comma). */
    public static final Pattern filterIdPattern = Pattern.compile(FILTER_ID_PATTERN);
    /** Finds the text between the first and the last double quote of the line. */
    public static final Pattern filterDescriptionPattern = Pattern.compile(FILTER_DESCRIPTION_PATTERN);

    private static final String FORMAT_PATTERN = "##FORMAT=";
    private static final String FORMAT_ID_PATTERN = "(?<=ID=)[\\w]+(?=,)";
    // Either a non-negative integer of any length or one of the special values '.', 'A', 'R', 'G'.
    private static final String FORMAT_NUMBER_PATTERN = "(?<=Number=)(?:\\d+|[\\.ARG])(?=,)";
    private static final String FORMAT_TYPE_PATTERN = "(?<=Type=)(Integer|Float|Character|String)(?=,)";
    private static final String FORMAT_DESCRIPTION_PATTERN = "(?<=\")[^\\n]+(?=\")";
    /** Finds the {@code ##FORMAT=} prefix. */
    public static final Pattern formatPattern = Pattern.compile(FORMAT_PATTERN);
    /** Finds the value of {@code ID=} (terminated by a comma). */
    public static final Pattern formatIdPattern = Pattern.compile(FORMAT_ID_PATTERN);
    /** Finds the value of {@code Number=}: an integer, or one of {@code . A R G}. */
    public static final Pattern formatNumberPattern = Pattern.compile(FORMAT_NUMBER_PATTERN);
    /** Finds the value of {@code Type=}; group 1 holds the type name. */
    public static final Pattern formatTypePattern = Pattern.compile(FORMAT_TYPE_PATTERN);
    /** Finds the text between the first and the last double quote of the line. */
    public static final Pattern formatDescriptionPattern = Pattern.compile(FORMAT_DESCRIPTION_PATTERN);

    // Genotype syntax: allele indices may have several digits; group 1 is always the separator.
    private static final String GENOTYPE_PATTERN = "(?:\\d+|\\.)(\\/|\\|)(?:\\d+|\\.)";
    private static final String GENOTYPE_FIRST_ALLELE_PATTERN = "\\d+(?=(\\/|\\|))";
    private static final String GENOTYPE_UNKNOWN_FIRST_ALLELE_PATTERN = "[\\.](?=(\\/|\\|))";
    private static final String GENOTYPE_SECOND_ALLELE_PATTERN = "(?<=(\\/|\\|))\\d+";
    private static final String GENOTYPE_UNKNOWN_SECOND_ALLELE_PATTERN = "(?<=(\\/|\\|))[\\.]";
    /** Matches {@code <allele><sep><allele>} where an allele is an index or {@code .}. */
    public static final Pattern genotypeValuePattern = Pattern.compile(GENOTYPE_PATTERN);
    /** Finds the numeric allele index that precedes the separator. */
    public static final Pattern genotypeFirstAllelePattern = Pattern.compile(GENOTYPE_FIRST_ALLELE_PATTERN);
    /** Finds the numeric allele index that follows the separator. */
    public static final Pattern genotypeSecondAllelePattern = Pattern.compile(GENOTYPE_SECOND_ALLELE_PATTERN);
    /** Finds a {@code .} allele that precedes the separator. */
    public static final Pattern genotypeUnknownFirstAllelePattern = Pattern.compile(GENOTYPE_UNKNOWN_FIRST_ALLELE_PATTERN);
    /** Finds a {@code .} allele that follows the separator. */
    public static final Pattern genotypeUnknownSecondAllelePattern = Pattern.compile(GENOTYPE_UNKNOWN_SECOND_ALLELE_PATTERN);

    public static final String GENOTYPE_FIELD = "GT";
    /** Regular expression matching either genotype separator ({@code /} or {@code |}). */
    public static final String GENOTYPE_DELIMITER = String.format("%s|%s", SLASH_DELIMITER, PIPE_DELIMITER);
    public static final String REFERENCE_ALLELE = "0";
    public static final String VALUE_PER_ALLELE = "A";
    public static final String VALUE_PER_ALLELE_WITH_REF = "R";

    private static final String INFO_PATTERN = "##INFO=";
    private static final String INFO_ID_PATTERN = "(?<=ID=)[\\w]+(?=,)";
    // Either a non-negative integer of any length or one of the special values '.', 'A', 'R', 'G'.
    private static final String INFO_NUMBER_PATTERN = "(?<=Number=)(?:\\d+|[\\.ARG])(?=,)";
    private static final String INFO_TYPE_PATTERN = "(?<=Type=)(Flag|Integer|Float|Character|String)(?=,)";
    // First alternative: shortest text ending at a quote that closes the attribute, i.e. a quote
    // followed by another ",key=" attribute or by the (optionally '>'-terminated) end of input,
    // so a trailing Source/Version attribute is not swallowed and quotes inside the description
    // are kept. Second alternative: the previous greedy behaviour, used only as a fallback.
    private static final String INFO_DESCRIPTION_PATTERN =
            "(?<=Description=\")(?:[^\\n]+?(?=\"\\s*(?:,\\s*\\w+\\s*=|>?\\s*$))|[^\\n]+(?=\"))";
    private static final String INFO_SOURCE_PATTERN = "(?<=Source=\")[^\\n\"]+(?=\")";
    // Stops at the first quote, like INFO_SOURCE_PATTERN, so a following attribute is not included.
    private static final String INFO_VERSION_PATTERN = "(?<=Version=\")[^\\n\"]+(?=\")";
    /** Finds the {@code ##INFO=} prefix. */
    public static final Pattern infoPattern = Pattern.compile(INFO_PATTERN);
    /** Finds the value of {@code ID=} (terminated by a comma). */
    public static final Pattern infoIdPattern = Pattern.compile(INFO_ID_PATTERN);
    /** Finds the value of {@code Number=}: an integer, or one of {@code . A R G}. */
    public static final Pattern infoNumberPattern = Pattern.compile(INFO_NUMBER_PATTERN);
    /** Finds the value of {@code Type=}; group 1 holds the type name. */
    public static final Pattern infoTypePattern = Pattern.compile(INFO_TYPE_PATTERN);
    /** Finds the quoted value of {@code Description=} without the surrounding quotes. */
    public static final Pattern infoDescriptionPattern = Pattern.compile(INFO_DESCRIPTION_PATTERN);
    /** Finds the quoted value of {@code Source=} without the surrounding quotes. */
    public static final Pattern infoSourcePattern = Pattern.compile(INFO_SOURCE_PATTERN);
    /** Finds the quoted value of {@code Version=} without the surrounding quotes. */
    public static final Pattern infoVersionPattern = Pattern.compile(INFO_VERSION_PATTERN);

    // One or more digits, so that counts of 10 and above are recognised as exact counts too.
    private static final String EXACT_COUNT_NUMBER_VALUE_PATTERN = "[\\d]+";
    /** Matches a purely numeric {@code Number} value. */
    public static final Pattern exactCountNumberValuePattern = Pattern.compile(EXACT_COUNT_NUMBER_VALUE_PATTERN);

    // Header line syntax
    private static final String HEADER_PATTERN = "#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO(\\tFORMAT(\\t[\\S]+)*)?";
    /** Matches the tab-separated column header line, with optional FORMAT and sample columns. */
    public static final Pattern headerPattern = Pattern.compile(HEADER_PATTERN);

    /** Number of columns from CHROM to INFO inclusive. */
    public static final int MANDATORY_COLUMNS_NUMBER = 8;
    public static final String COLUMN_DELIMITER = TAB_DELIMITER;
    // Zero-based column positions, in the order listed by HEADER_PATTERN.
    public static final int CHROM_COLUMN_IDX = 0;
    public static final int POS_COLUMN_IDX = 1;
    public static final int ID_COLUMN_IDX = 2;
    public static final int REF_COLUMN_IDX = 3;
    public static final int ALT_COLUMN_IDX = 4;
    public static final int QUAL_COLUMN_IDX = 5;
    public static final int FILTER_COLUMN_IDX = 6;
    public static final int INFO_COLUMN_IDX = 7;
    public static final int FORMAT_COLUMN_IDX = 8;

    // Data line syntax
    // TODO: refine this pattern; REF currently accepts only the letters A, T, G, C and
    // ALT only those letters plus ',' and '.'.
    private static final String DATA_LINE_PATTERN = "[\\S]+\\t[\\d]+\\t[\\S]+\\t[ATGC]+\\t[ATGC,\\.]+\\t[\\d\\.]+\\t[\\S]+\\t[\\S]+(\\t[\\S]+(\\t[\\S]+)*)?";
    /** Matches a tab-separated data line with the eight mandatory columns and optional extras. */
    public static final Pattern dataLinePattern = Pattern.compile(DATA_LINE_PATTERN);

    public static final String FILTER_PASSED = "PASS";
    public static final String MISSING_VALUE = ".";
    public static final String ALLELE_DELIMITER = COMMA_DELIMITER;
    public static final String FILTER_DELIMITER = SEMICOLON_DELIMITER;
    public static final String FORMAT_DELIMITER = COLON_DELIMITER;
    public static final String ID_DELIMITER = SEMICOLON_DELIMITER;
    public static final String INFO_DELIMITER = SEMICOLON_DELIMITER;
    public static final String INFO_KEY_VALUE_DELIMITER = EQUAL_DELIMITER;
    public static final String INFO_VALUE_DELIMITER = COMMA_DELIMITER;
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy