
uk.ac.ebi.embl.api.validation.EnaValidator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of embl-api-validator Show documentation
Show all versions of embl-api-validator Show documentation
flat file parser and validator
The newest version!
/*******************************************************************************
* Copyright 2012 EMBL-EBI, Hinxton outstation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package uk.ac.ebi.embl.api.validation;
import uk.ac.ebi.embl.agp.reader.AGPFileReader;
import uk.ac.ebi.embl.agp.reader.AGPLineReader;
import uk.ac.ebi.embl.api.entry.Entry;
import uk.ac.ebi.embl.api.validation.check.feature.CdsFeatureTranslationCheck;
import uk.ac.ebi.embl.api.validation.helper.FileUtils;
import uk.ac.ebi.embl.api.validation.helper.FlattenedMessageResult;
import uk.ac.ebi.embl.api.validation.helper.FlattenedValidationPlanResult;
import uk.ac.ebi.embl.api.validation.helper.Utils;
import uk.ac.ebi.embl.api.validation.helper.ValidationMessageComparator;
import uk.ac.ebi.embl.api.validation.plan.EmblEntryValidationPlan;
import uk.ac.ebi.embl.api.validation.plan.EmblEntryValidationPlanProperty;
import uk.ac.ebi.embl.api.validation.plan.GFF3ValidationPlan;
import uk.ac.ebi.embl.api.validation.plan.GenomeAssemblyValidationPlan;
import uk.ac.ebi.embl.api.validation.plan.ValidationPlan;
import uk.ac.ebi.embl.fasta.reader.FastaFileReader;
import uk.ac.ebi.embl.fasta.reader.FastaLineReader;
import uk.ac.ebi.embl.flatfile.reader.FlatFileReader;
import uk.ac.ebi.embl.flatfile.reader.embl.EmblEntryReader;
import uk.ac.ebi.embl.flatfile.reader.genbank.GenbankEntryReader;
import uk.ac.ebi.embl.flatfile.validation.FlatFileValidations;
import uk.ac.ebi.embl.flatfile.writer.EntryWriter;
import uk.ac.ebi.embl.flatfile.writer.WrapType;
import uk.ac.ebi.embl.flatfile.writer.degenerator.DEGenerator;
import uk.ac.ebi.embl.flatfile.writer.embl.EmblEntryWriter;
import uk.ac.ebi.embl.flatfile.writer.genbank.GenbankEntryWriter;
import uk.ac.ebi.embl.gff3.reader.GFF3FlatFileEntryReader;
import org.apache.commons.dbutils.DbUtils;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import java.io.*;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.*;
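/**
* Command-line entry point that validates, and optionally fixes, flat files
* in the formats listed in the option help text (embl, genbank, gff3,
* assembly) and writes the results to VAL_* report files.
* Originally created by Lawrence on 08-Dec-2008.
*/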
public class EnaValidator
{
// ---------------------------------------------------------------------------
// Command-line option names.
// NOTE(review): these are presumably referenced by the option-holder class
// (Params) declared elsewhere in this file - confirm before renaming any.
// ---------------------------------------------------------------------------
public static final String help_arg = "-help";
public static final String log_level_arg = "-l";
public static final String remote_arg = "-r";
public static final String fixer_arg = "-fix";
public static final String filter_arg = "-filter";
public static final String fix_diagnose_arg = "-fix_diagnose";
public static final String skip_arg = "-skip";
public static final String low_memory_arg = "-lowmemory";
public static final String write_de_arg = "-de";
public static final String wrap_arg = "-wrap";
// Mutable, despite sitting among the constants: init() overwrites it from the
// -f option and then again from the detected type of the first input file.
public static FileType fileType = FileType.EMBL;
public static final String file_format = "-f";
public static final String prefix_token = "-prefix";
public static final String min_gap_length_token = "-min_gap_length";
public static final String assembly_token = "-assembly";
public static final String transTable_token = "-table";
public static final String version_token="-version";
// ---------------------------------------------------------------------------
// Human-readable descriptions of the options above (usage/help text).
// NOTE(review): filterString and skipString look as if placeholder names were
// lost from their text ("-filter Store ...", "-skip ,,...") - compare with the
// upstream source before relying on the wording.
// ---------------------------------------------------------------------------
private static final String fileformatString = "File format(optional) Values:'embl','genbank','gff3','assembly'";
private static final String log_levelString = "Log level(optional) Values : 0(Quiet), 1(Summary), 2(Verbose)";
private static final String remoteString = "Remote, is this being run outside the EBI(optional)";
private static final String fixString = "Fixes entries in input files. Stores input files in 'original_files' folder. (optional)";
private static final String filterString = "-filter Store entries in _good.txt and _bad.txt files in the working directory. Entries with errors are stored in the bad file and entries without errors are stored in the good file. (optional)(default :false)";
private static final String fix_diagnoseString = "Creates 'diagnose' folder in the current directory with original entries in _origin file and the fixed entries in _fixed file. Only fixed entries will be stored in these files.(optional) ";
private static final String skipString = "-skip ,,... Ignore specified errors.(optional)(default:false) ";
private static final String lowmemoryString = "Runs in low memory usage mode. Writes error logs but does not show message summary(optional)";
private static final String wrapString = "Turns on line wrapping in flat file writing (optional) ";
private static final String helpString = "Displays available options";
private static final String prefix_string = "Adds prefix to report files";
private static final String min_gap_length_string = "minimum gap length to generate assembly_gap/gap features, use assembly flag to add assembly_gap features";
private static final String assembly_string = "genome assembly entries";
private static final String version_string ="Displays implementation version of Jar";
// Names of the supported file formats.
protected static final String EMBL_FORMAT = "embl";
protected static final String GENBANK_FORMAT = "genbank";
protected static final String GFF3_FORMAT = "gff3";
protected static final String ASSEMBLY_FORMAT = "assembly";
protected static final String FASTA_FORMAT = "fasta";
// Log levels; the numeric values match the "-l" help text:
// 0 (quiet), 1 (summary), 2 (verbose).
public static final int LOG_LEVEL_ALL = 2;
public static final int LOG_LEVEL_QUIET = 0;
public static final int LOG_LEVEL_SUMMARY = 1;
// Passed to Utils.flattenMessages when the run summary is built; the summary
// describes compressed messages as "occurring more than" this many times.
private static final int MESSAGE_FLATTEN_THRESHOLD = 5;
/**
* the number of validation messages stored before we start worrying about
* memory and go into low memory mode
*/
private static final int LOW_MEMORY_THRESHOLD = 1000000;
/** Flat file reader; it is not assigned anywhere in the visible part of this class. */
protected FlatFileReader reader = null;
/** Optional prefix for the VAL_* report file names; {@code null} means no prefix. */
protected static String prefix;
protected static int log_level = LOG_LEVEL_SUMMARY;// default
protected static boolean remote = false;// default
protected static boolean testMode = false;// default
protected static boolean fixMode = false;// default
protected static boolean fixDiagnoseMode = false;// default
protected static boolean filterMode = false;// default
/** Prefix of the {@code _good.txt} / {@code _bad.txt} files written in filter mode. */
protected static String filterPrefix = null;// default
protected static boolean lowMemoryMode = false;// default
protected static boolean writeDeMode = false;// default
public static WrapType wrapType = WrapType.NO_WRAP;// default
public static boolean lineCount = true;// default
protected static int min_gap_length = 0;// default
protected static boolean assembly = false;
/**
* Files to validate, collected by {@code init}. The element type is declared
* explicitly because the list is iterated as {@code File} elsewhere in this class.
*/
protected List<File> entryFiles;
private ValidationPlan emblValidator;
private ValidationPlan gff3Validator;
private ValidationPlan gaValidator;
protected boolean parseError;
/* Counters reported in the "*** SUMMARY***" section of the run summary. */
private int totalEntryCount = 0;
private int fixCount = 0;
private int failCount = 0;
private int unchangedCount = 0;
/** Error codes given with the skip option, split on commas by {@code init}. */
protected List<String> suppressedErrorCodes = new ArrayList<String>();
/** Parse results whose messages are added to the run summary and counted as parsing errors. */
protected List<ValidationResult> parseResults = new ArrayList<ValidationResult>();
/**
* writers for separating good files and bad files - their use needs to be
* requested in the arguments (filter option)
*/
Writer goodFilesWriter;
Writer badFilesWriter;
/**
* writers for logging all errors, warnings etc
*/
Writer summaryWriter;
Writer infoWriter;
Writer errorWriter;
Writer reportWriter;
Writer fixWriter;
/*
* database connection; handed to the validation plan property in
* initValidator and closed by main
*/
protected static Connection con = null;
/**
 * Command-line entry point: parses the arguments, builds the validation
 * plans, validates every input file and terminates the JVM.
 *
 * Exit status: 0 when no entry failed, 3 when at least one entry failed,
 * 1 when an exception escaped.
 *
 * @param args the command-line arguments
 */
public static void main(String[] args)
{
    int exitCode;
    try
    {
        EnaValidator enaValidator = new EnaValidator();
        enaValidator.init(args, null);
        enaValidator.initValidator();
        int failedCount = enaValidator.validateFiles();
        exitCode = failedCount > 0 ? 3 : 0;
    }
    catch (Exception e)
    {
        e.printStackTrace();
        exitCode = 1;
    }
    finally
    {
        DbUtils.closeQuietly(con);
    }
    // Exit only after the finally block has run: System.exit() does not
    // return, so calling it inside the try would skip the connection cleanup.
    System.exit(exitCode);
}
/**
 * Builds the three validation plans (EMBL entry, GFF3, genome assembly) from
 * the current static settings, registers their message bundles and opens the
 * report writers.
 *
 * @throws SQLException
 *             declared by the signature; no statement in this body visibly
 *             throws it
 * @throws IOException
 *             if the report writers cannot be created
 */
protected void initValidator() throws SQLException, IOException
{
    // One property object is shared by all three plans.
    EmblEntryValidationPlanProperty planProperty = new EmblEntryValidationPlanProperty();
    planProperty.validationScope.set(ValidationScope.getScope(fileType));
    planProperty.isDevMode.set(testMode);
    // Diagnose mode also runs the fixers.
    boolean fixing = fixMode || fixDiagnoseMode;
    planProperty.isFixMode.set(fixing);
    planProperty.minGapLength.set(min_gap_length);
    planProperty.isAssembly.set(assembly);
    planProperty.isRemote.set(remote);
    planProperty.fileType.set(fileType);
    planProperty.enproConnection.set(con);

    emblValidator = new EmblEntryValidationPlan(planProperty);
    emblValidator.addMessageBundle(ValidationMessageManager.STANDARD_VALIDATION_BUNDLE);
    emblValidator.addMessageBundle(ValidationMessageManager.STANDARD_FIXER_BUNDLE);

    gff3Validator = new GFF3ValidationPlan(planProperty);
    gff3Validator.addMessageBundle(ValidationMessageManager.GFF3_VALIDATION_BUNDLE);

    gaValidator = new GenomeAssemblyValidationPlan(planProperty);
    gaValidator.addMessageBundle(ValidationMessageManager.GENOMEASSEMBLY_VALIDATION_BUNDLE);
    gaValidator.addMessageBundle(ValidationMessageManager.GENOMEASSEMBLY_FIXER_BUNDLE);

    initWriters();
}
/**
 * Parses the command-line arguments, copies the options into the static
 * settings of this class and collects the files to validate.
 *
 * Terminates the JVM with status 2 when the options cannot be parsed, when no
 * arguments (or only the help option) are given, or when no file name is
 * given; terminates with status 0 after printing the version.
 *
 * If one of the given paths does not exist, a message is printed and the
 * method returns immediately, leaving {@code entryFiles} holding only the
 * files collected so far.
 * NOTE(review): the message says "exiting" but the JVM is not stopped here -
 * confirm whether callers rely on that.
 *
 * @param args the command-line arguments
 * @param message text printed instead of the generated usage when not {@code null}
 * @throws SQLException declared by the signature; nothing in this body visibly throws it
 * @throws IOException declared by the signature; presumably raised by file-type detection - confirm
 */
protected void init(String[] args, String message) throws SQLException, IOException
{
    Params params = new Params();
    JCommander jc = new JCommander(params);
    jc.setProgramName("ena_validator ");

    boolean parsed = true;
    try
    {
        jc.parse(args);
    }
    catch (Exception e)
    {
        System.err.println("Invalid options");
        parsed = false;
    }
    // Unparseable options, no arguments at all, or a lone help flag: show the
    // usage (or the caller-supplied message) and stop.
    if (!parsed || args.length == 0 || (args.length == 1 && params.help))
    {
        if (message == null)
        {
            jc.usage();
            writeReturnCodes();
        }
        else
        {
            System.out.println(message);
        }
        System.exit(2);
    }
    if (params.version)
    {
        System.out.println(this.getClass().getPackage().getImplementationVersion());
        System.exit(0);
    }
    if (params.filenames.isEmpty())
    {
        System.err.println("Please give the filenames (or) directory with files to validate");
        jc.usage();
        // Without any input there is nothing to validate; stop like the other
        // usage errors instead of failing later on an empty file list.
        System.exit(2);
    }

    fileType = FileType.get(params.fileFormat);
    prefix = params.prefixString;
    log_level = params.log_level;
    remote = params.remote;
    lowMemoryMode = params.lowmemory;
    min_gap_length = params.min_gap_length;
    assembly = params.assembly;
    if (params.wrap)
    {
        wrapType = WrapType.EMBL_WRAP;
    }
    if (params.skip != null)
    {
        // Accepts "(code1,code2)" as well as "code1,code2".
        String[] suppressArray = params.skip.replace("(", "").replace(")", "").split(",");
        suppressedErrorCodes = new ArrayList<String>(Arrays.asList(suppressArray));
    }
    fixMode = params.fix;
    writeDeMode = params.fixDe;
    fixDiagnoseMode = params.fix_diagnose;
    if (params.filter != null)
    {
        filterMode = true;
        filterPrefix = params.filter;
    }

    List<String> fileStrings = params.filenames;
    List<File> files = new ArrayList<File>();
    entryFiles = files;
    for (String fileString : fileStrings)
    {
        File fileHandle = new File(fileString);
        if (!fileHandle.exists())
        {
            printMessageLine("File " + fileHandle.getPath() + " does not exist - exiting", LOG_LEVEL_QUIET);
            return;
        }
        if (fileHandle.isDirectory())
        {
            printMessageLine("Directory found : " + fileHandle.getPath(), LOG_LEVEL_ALL);
            // listFiles() returns null when the directory cannot be read.
            File[] children = fileHandle.listFiles();
            if (children != null)
            {
                files.addAll(Arrays.asList(children));
            }
        }
        else
        {
            printMessageLine("File found : " + fileHandle.getPath(), LOG_LEVEL_ALL);
            files.add(fileHandle);
        }
    }

    // The first file decides the file type for the whole run, overriding the
    // format option when detection succeeds. An empty directory leaves the
    // list empty, so there may be nothing to inspect.
    if (!files.isEmpty())
    {
        FileType detectedType = FileUtils.getFileType(files.get(0));
        if (detectedType != null)
        {
            fileType = detectedType;
        }
    }
}
/**
* Validate files.
* @throws ValidationEngineException
*/
private int validateFiles()
{
List planResults = new ArrayList();
int parseErrorCount = 0;
try
{
long timeIn = System.currentTimeMillis();
if (filterMode && filterPrefix != null)
{
goodFilesWriter = new PrintWriter(filterPrefix + "_good.txt","UTF-8");
badFilesWriter = new PrintWriter(filterPrefix + "_bad.txt","UTF-8");
}
for (File file : entryFiles)
{
List results = validateFile( file,errorWriter);
planResults.addAll(results);
}
infoWriter.flush();
errorWriter.flush();
reportWriter.flush();
fixWriter.flush();
infoWriter.close();
errorWriter.close();
reportWriter.close();
fixWriter.close();
if (filterMode && filterPrefix != null)
{
badFilesWriter.flush();
badFilesWriter.close();
goodFilesWriter.flush();
goodFilesWriter.close();
}
List> messages = new ArrayList>();
for (ValidationPlanResult planResult : planResults)
{
messages.addAll(planResult.getMessages());
}
for (ValidationResult parseResult : parseResults)
{
messages.addAll(parseResult.getMessages());
for (ValidationMessage message : parseResult.getMessages())
{
parseErrorCount++;
}
}
/**
* will be built up to form the summary for the whole run
*/
String summaryLine = "";
if (!planResults.isEmpty())
{
FlattenedMessageResult results = Utils.flattenMessages(messages,MESSAGE_FLATTEN_THRESHOLD);
List flattenedMessages = results.getFlattenedMessages();
List unFlattenedMessages = results.getUnFlattenedMessages();
Collections.sort( flattenedMessages,new ValidationMessageComparator());
Collections.sort( unFlattenedMessages,new ValidationMessageComparator());
if (!flattenedMessages.isEmpty())
{
summaryLine = summaryLine.concat("\n\n***MESSAGES SUMMARY***");
summaryLine = summaryLine.concat("\nCompressed messages (occurring more than "+ MESSAGE_FLATTEN_THRESHOLD + " times)");
for (ValidationMessage message : flattenedMessages)
{
summaryLine = summaryLine.concat("\n"+ message.getSeverity());
summaryLine = summaryLine.concat(": ");
summaryLine = summaryLine.concat(message.getMessage());
summaryLine = summaryLine.concat(" ("+ message.getMessageKey() + ") ");
}
}
if (!unFlattenedMessages.isEmpty())
{
summaryLine = summaryLine.concat("\n\nMessages");
for (ValidationMessage message : unFlattenedMessages)
{
summaryLine = summaryLine.concat("\n"+ message.getSeverity());
summaryLine = summaryLine.concat(": ");
summaryLine = summaryLine.concat(message.getMessage());
summaryLine = summaryLine.concat(" ("+ message.getMessageKey() + ") ");
for (Object origin : message.getOrigins())
{
StringWriter writer = new StringWriter();
String text = ((Origin) origin).getOriginText();
writer.write(text);
summaryLine = summaryLine.concat(writer.toString());
writer.close();
}
}
}
summaryLine = summaryLine.concat("\n\n***FILE SUMMARY***\n");
List flattenedPlanResults = Utils.flattenValidationPlans(planResults);
for (FlattenedValidationPlanResult flattenedResult : flattenedPlanResults)
{
summaryLine = summaryLine.concat(flattenedResult.getFileName() + " - ");
summaryLine = summaryLine.concat(flattenedResult.getEntryCount() + " entries, ");
summaryLine = summaryLine.concat(flattenedResult.getFailedEntryCount() + " failed entries, ");
summaryLine = summaryLine.concat((flattenedResult.getErrorCount() + parseErrorCount) + " errors, ");
summaryLine = summaryLine.concat(flattenedResult.getFixCount() + " fixes, ");
summaryLine = summaryLine.concat(flattenedResult.getWarningInfoCount() + " warnings & info");
summaryLine = summaryLine.concat("\n");
}
}
summaryLine = summaryLine.concat("\n*** SUMMARY***\n");
summaryLine = summaryLine.concat("Parsing error:" + parseErrorCount+ "\n");
summaryLine = summaryLine.concat("Fixed Entries:" + fixCount + "\n");
summaryLine = summaryLine.concat("Failed Entries:" + failCount+ "\n");
summaryLine = summaryLine.concat("Checked Entries:"+ totalEntryCount + "\n");
summaryLine = summaryLine.concat("Unchanged Entries:"+ unchangedCount + "\n");
long timeOut = System.currentTimeMillis();
long timeToRun = (timeOut - timeIn) / 1000;
summaryLine = summaryLine.concat("\n\nProcessed " + totalEntryCount+ " entries in " + timeToRun + " seconds.\n\n");
printMessage( summaryLine,LOG_LEVEL_SUMMARY);
summaryWriter.write(summaryLine);
summaryWriter.flush();
summaryWriter.close();
}
catch (IOException e)
{
e.printStackTrace();
}
return failCount;
}
/**
 * Opens the five UTF-8 report files (summary, info, error, reports, fixes).
 * Each file name starts with {@code prefix + "_"} when a prefix is set.
 * Kept as a separate method so unit tests can call it.
 *
 * @throws IOException
 *             if one of the files cannot be opened
 */
protected void initWriters() throws IOException
{
    final String charset = "UTF-8";
    // Empty when no prefix was given, otherwise "<prefix>_".
    final String namePrefix = (prefix == null) ? "" : prefix + "_";

    summaryWriter = new PrintWriter(namePrefix + "VAL_SUMMARY.txt", charset);
    infoWriter = new PrintWriter(namePrefix + "VAL_INFO.txt", charset);
    errorWriter = new PrintWriter(namePrefix + "VAL_ERROR.txt", charset);
    reportWriter = new PrintWriter(namePrefix + "VAL_REPORTS.txt", charset);
    fixWriter = new PrintWriter(namePrefix + "VAL_FIXES.txt", charset);
}
/**
* Validate file.
*
* @param file
* the file
* @param writer
* the writer
* @return the list of ValidationPlanResult
* @throws IOException
* @throws ValidationEngineException
*/
private List validateFile(File file, Writer writer) throws IOException
{
List messages = new ArrayList();
ArrayList
© 2015 - 2025 Weber Informatics LLC | Privacy Policy