com.novartis.opensource.yada.io.VCFHelper Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2016 Novartis Institutes for BioMedical Research Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.novartis.opensource.yada.io;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * {@link TabHelper} specialization for Variant Call Format (VCF) files.
 * <p>
 * A VCF file starts with a block of {@code ##key=value} meta-information lines
 * (the "file header"), followed by a single {@code #CHROM...} line naming the
 * columns. This class separates the two and indexes the meta-information lines
 * into the file header map.
 * </p>
 * See the VCF specification (1000 Genomes) for the file format details.
 *
 * @author David Varon
 */
public class VCFHelper extends TabHelper{
  /**
   * Local logger handle
   */
  private static final Logger l = Logger.getLogger(VCFHelper.class);
  /**
   * Matches any meta-information line. Constant equal to: {@code "##(.*)=(.*)"}
   * <p>
   * Note that group 1 is greedy and therefore extends to the <em>last</em>
   * {@code =} on the line; use it only to recognize header lines, not to
   * extract the key.
   * </p>
   */
  protected static final Pattern HEADER_RX = Pattern.compile("##(.*)=(.*)");
  /**
   * Matches the column header line. Constant equal to: {@code "#CHROM\\s.*"}
   */
  protected static final Pattern COL_HEAD_RX = Pattern.compile("#CHROM\\s.*");
  /**
   * Matches structured meta-information lines. Group 1 is the line type
   * ({@code INFO}, {@code FILTER}, {@code FORMAT} or {@code ALT}).
   * Constant equal to: {@code "##(INFO|FILTER|FORMAT|ALT)=<((ID|Number|Type|Description)=(\"?.*\"?))+>"}
   */
  protected static final Pattern H_FIELDS_RX = Pattern.compile("##(INFO|FILTER|FORMAT|ALT)=<((ID|Number|Type|Description)=(\"?.*\"?))+>");
  /**
   * Constant equal to: {@value}
   */
  protected static final String FILE_FORMAT = "fileFormat";
  /**
   * Constant equal to: {@value}
   */
  protected static final String DESCRIPTION = "Description";
  /**
   * Constant equal to: {@value}
   */
  protected static final String CHROM = "CHROM";
  /**
   * Constant equal to: {@value}
   */
  protected static final String POS = "POS";
  /**
   * Constant equal to: {@value}
   */
  protected static final String ID = "ID";
  /**
   * Constant equal to: {@value}
   */
  protected static final String REF = "REF";
  /**
   * Constant equal to: {@value}
   */
  protected static final String ALT = "ALT";
  /**
   * Constant equal to: {@value}
   */
  protected static final String QUAL = "QUAL";
  /**
   * Constant equal to: {@value}
   */
  protected static final String FILTER = "FILTER";
  /**
   * Constant equal to: {@value}
   */
  protected static final String INFO = "INFO";
  /**
   * Constant equal to: {@value}
   */
  protected static final String FORMAT = "FORMAT";
  /**
   * Constant equal to: {@value}
   */
  protected static final String DOT = ".";

  /**
   * Processes the file header and the column header of a VCF file.
   * <p>
   * Lines are read from the reader until one matches {@link #COL_HEAD_RX}.
   * That line becomes the column header; every line read before it is
   * accumulated, newline-terminated, into the file header. Afterwards the
   * column header array and the file header map are populated.
   * </p>
   *
   * @throws YADAIOException when reading from the underlying reader fails
   * @see com.novartis.opensource.yada.io.TabHelper#setHeaders()
   */
  @Override
  protected void setHeaders() throws YADAIOException
  {
    String line = "";
    // method-local accumulator: no synchronization needed
    StringBuilder fh = new StringBuilder();
    boolean areHeadersSet = false;
    try
    {
      while(!areHeadersSet && (line = ((BufferedReader)this.reader).readLine()) != null)
      {
        if(COL_HEAD_RX.matcher(line).matches())
        {
          areHeadersSet = true;
          setColumnHeader(line);
          setFileHeader(fh.toString());
        }
        else
        {
          fh.append(line);
          fh.append(NEWLINE);
        }
      }
      if(!areHeadersSet)
      {
        // end of input reached before any column header line was seen
        l.warn("No #CHROM column header line found in VCF input");
      }
      setColHeaderArray();
      setFileHeaderMap();
    }
    catch(IOException e)
    {
      throw new YADAIOException(e.getMessage(),e);
    }
  }

  /**
   * Indexes the VCF meta-information lines of the file header into the file
   * header map.
   * <ul>
   * <li>Structured lines ({@code ##INFO=<...>}, {@code ##FILTER=<...>},
   * {@code ##FORMAT=<...>}, {@code ##ALT=<...>}) are stored one entry per
   * attribute under {@code TYPE_attribute}, e.g. {@code INFO_ID},
   * {@code FILTER_Description}.</li>
   * <li>Any other {@code ##key=value} line is stored under {@code key}, split
   * at the first {@code =}.</li>
   * </ul>
   * Keys are not qualified by the attribute {@code ID}, so a later line of the
   * same type replaces the entries of an earlier one.
   *
   * @see com.novartis.opensource.yada.io.FileHelper#setFileHeaderMap()
   */
  @Override
  protected void setFileHeaderMap()
  {
    l.info("Setting VCF file header...");
    if (null == this.fileHeaderMap)
    {
      this.fileHeaderMap = new HashMap<>();
    }
    if (null == getFileHeader())
    {
      // setHeaders() only sets the file header once it has found the column
      // header line, so there may be nothing to parse
      l.warn("No VCF file header available to parse");
      return;
    }
    try(Scanner s = new Scanner(getFileHeader()))
    {
      while(s.hasNextLine())
      {
        String line = s.nextLine();
        if(!HEADER_RX.matcher(line).matches())
        {
          // not a "##key=value" meta-information line
          continue;
        }
        Matcher structured = H_FIELDS_RX.matcher(line);
        if(structured.matches())
        {
          // group 1 is the line type, group 2 the text between '<' and '>'
          mapStructuredHeaderLine(structured.group(1), structured.group(2));
        }
        else
        {
          // HEADER_RX guarantees a leading "##" and at least one '=';
          // split at the FIRST '=' so values may themselves contain '='
          int eq = line.indexOf('=');
          this.fileHeaderMap.put(line.substring(2, eq), line.substring(eq + 1));
        }
      }
    }
    catch(NoSuchElementException e)
    {
      l.error("Unexpected end of VCF file header while parsing", e);
    }
  }

  /**
   * Stores the attributes of one structured meta-information line in the file
   * header map, each under the key {@code type_attributeName}.
   *
   * @param type the line type, e.g. {@code INFO}
   * @param body the attribute list found between {@code <} and {@code >}
   */
  private void mapStructuredHeaderLine(String type, String body)
  {
    String attribList = body;
    // handle the description first, because it's a quoted string which can contain commas
    int descIdx = body.indexOf(DESCRIPTION + "=");
    if(descIdx >= 0)
    {
      // value is stored verbatim, including its surrounding quotes
      this.fileHeaderMap.put(type + "_" + DESCRIPTION, body.substring(descIdx + DESCRIPTION.length() + 1));
      attribList = body.substring(0, descIdx);
    }
    for(String attr : attribList.split(","))
    {
      int eq = attr.indexOf('=');
      if(eq > 0)
      {
        // i.e., INFO_ID, val; FORMAT_Type, val; etc
        this.fileHeaderMap.put(type + "_" + attr.substring(0, eq), attr.substring(eq + 1));
      }
      // empty or valueless fragments (e.g. when Description comes first) are skipped
    }
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy