// net.sf.okapi.steps.common.ExtractionVerificationStep (Maven / Gradle / Ivy)
/*===========================================================================
Copyright (C) 2011 by the Okapi Framework contributors
-----------------------------------------------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
===========================================================================*/
package net.sf.okapi.steps.common;
import java.io.File;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.exceptions.OkapiBadStepInputException;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.filters.IFilterConfigurationMapper;
import net.sf.okapi.common.filterwriter.IFilterWriter;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.StartDocument;
import net.sf.okapi.common.resource.StartSubDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Verifies that a {@link RawDocument} is extracted and merged back properly.
* This step performs a first extraction, merges the result without changing
* the data, then re-extracts the file generated by the merge, and compares the events
* generated in both extractions. There should be no difference.
* This verification does not verify that the merged file is valid, but it
* should catch most of the problems caused by invalid merges.
*/
@UsingParameters(ExtractionVerificationStepParameters.class)
public class ExtractionVerificationStep extends BasePipelineStep {

    private final Logger LOGGER = LoggerFactory.getLogger(getClass());

    // filter1 reads the original document, filter2 reads the merged copy.
    private IFilter filter1;
    private IFilter filter2;
    private IFilterWriter writer;
    private IFilterConfigurationMapper fcMapper;
    private String filterConfigId;
    private ExtractionVerificationStepParameters params;
    ExtractionVerificationUtil verificationUtil;
    LocaleId localeId;

    /**
     * Creates a new ExtractionVerificationStep object with default parameters.
     * This constructor is needed to be able to instantiate an object from
     * newInstance().
     */
    public ExtractionVerificationStep() {
        params = new ExtractionVerificationStepParameters();
        verificationUtil = new ExtractionVerificationUtil();
    }

    /**
     * Sets the parameters of this step.
     * @param params the new parameters; must be an
     * {@link ExtractionVerificationStepParameters} instance.
     */
    @Override
    public void setParameters(IParameters params) {
        this.params = (ExtractionVerificationStepParameters) params;
    }

    /**
     * Gets the parameters currently used by this step.
     * @return the current parameters.
     */
    @Override
    public ExtractionVerificationStepParameters getParameters() {
        return params;
    }

    /**
     * Sets the mapper used to create the filters for both extractions.
     * @param fcMapper the filter configuration mapper.
     */
    @StepParameterMapping(parameterType = StepParameterType.FILTER_CONFIGURATION_MAPPER)
    public void setFilterConfigurationMapper (IFilterConfigurationMapper fcMapper) {
        this.fcMapper = fcMapper;
    }

    /**
     * Sets the identifier of the filter configuration to use.
     * When it is empty, {@link #handleRawDocument(Event)} performs no verification.
     * @param filterConfigId the filter configuration identifier.
     */
    @StepParameterMapping(parameterType = StepParameterType.FILTER_CONFIGURATION_ID)
    public void setFilterConfigurationId (String filterConfigId) {
        this.filterConfigId = filterConfigId;
    }

    /**
     * Gets the display name of this step.
     * @return the name of the step.
     */
    public String getName () {
        return "Extraction Verification";
    }

    /**
     * Gets a short description of this step.
     * @return the description of the step.
     */
    public String getDescription () {
        return "Verifies a raw document can be extracted, merged, then extracted again and produces the same set of events during both extractions."
            + " Expects: raw document. Sends back: unmodified raw document.";
    }

    /**
     * Extracts the raw document, merges it unchanged into a temporary file,
     * then extracts both the original and the merged file side by side and
     * compares the events pair by pair.
     * <p>Differences are logged as warnings. Events are compared until the
     * number of differences reaches the limit given by the parameters (a limit
     * of 0 or less means no limit); after that the remaining events are only
     * counted. If the interrupt option is set, reaching the limit raises an
     * exception instead.
     * <p>The input raw document is closed before this method returns.
     * @param event the event holding the raw document to verify.
     * @return the same event that was passed in.
     * @throws OkapiException if the filter cannot be created or if anything
     * fails during the verification (the original error is set as the cause).
     */
    @Override
    protected Event handleRawDocument (Event event) {
        if ( !params.getStepEnabled() ) {
            LOGGER.info("ExtractionVerificationStep is disabled");
            return event;
        }

        verificationUtil.setCompareSkeleton(params.getCompareSkeleton());
        verificationUtil.setTargetLocaleOverriden(false);

        Event event1 = null;
        Event event2 = null;
        int count1 = 0;
        int count2 = 0;
        int errorCount = 0;
        int limit = params.getLimit();
        boolean interrupt = params.getInterrupt();
        boolean reachedMax = false;
        RawDocument tmpDoc = null;
        // Declared outside the try block so the finally block can always remove it
        File outFile = null;

        try ( RawDocument initialDoc = event.getRawDocument() ) {
            if ( Util.isEmpty(filterConfigId) ) {
                // No filter configuration: nothing can be verified
                return event;
            }

            // Else: Get the filters to use
            filter1 = fcMapper.createFilter(filterConfigId);
            filter2 = fcMapper.createFilter(filterConfigId);
            if (( filter1 == null ) || ( filter2 == null )) {
                throw new OkapiException("Unsupported filter type.");
            }

            //=== First pass: extract and merge back without changing the data
            verificationUtil.setTargetLocale(initialDoc.getTargetLocale());
            // Open the document
            filter1.open(initialDoc);
            // Create the filter writer used to write out the document
            writer = filter1.createFilterWriter();
            // Open the output document
            outFile = File.createTempFile("~okapi-38_okp-vx_", ".tmp");
            writer.setOutput(outFile.getAbsolutePath());
            writer.setOptions(initialDoc.getSourceLocale(), initialDoc.getEncoding());
            while ( filter1.hasNext() ) {
                event1 = filter1.next();
                writer.handleEvent(event1);
            }
            writer.close();
            filter1.close();

            //=== Second pass: Extract from the merged file and compare
            tmpDoc = new RawDocument(outFile.toURI(), initialDoc.getEncoding(),
                initialDoc.getSourceLocale(), initialDoc.getTargetLocale());
            // A fresh filter instance is used to re-read the original document
            filter1 = fcMapper.createFilter(filterConfigId);
            filter1.open(initialDoc);
            filter2.open(tmpDoc);

            boolean hasNext1 = filter1.hasNext();
            boolean hasNext2 = filter2.hasNext();
            while ( hasNext1 || hasNext2 ) {
                if ( hasNext1 ) {
                    count1++;
                    event1 = filter1.next();
                }
                if ( hasNext2 ) {
                    count2++;
                    event2 = filter2.next();
                }

                // Compare only while both runs still produce events and the
                // maximum number of reported differences is not reached.
                // The loop itself keeps going so the event totals stay accurate.
                if ( hasNext1 && hasNext2 && !reachedMax ) {
                    if ( !identicalEvent(event1, event2) ) {
                        errorCount++;
                        LOGGER.warn("different events");
                        if (( limit > 0 ) && ( errorCount >= limit )) {
                            reachedMax = true;
                        }
                        if ( reachedMax && interrupt ) {
                            throw new OkapiBadStepInputException("Reached maximum verification errors");
                        }
                    }
                }

                hasNext1 = filter1.hasNext();
                hasNext2 = filter2.hasNext();
            }

            // Compare total number of events
            if ( count1 > count2 ) {
                LOGGER.warn("ExtractionVerification: Additional events found in the first run");
            }
            else if ( count2 > count1 ) {
                LOGGER.warn("ExtractionVerification: Additional events found in the second run");
            }

            // Report the outcome of the event comparison
            if ( errorCount > 0 ) {
                LOGGER.warn("ExtractionVerification: {} or more events fail.", errorCount);
            }
            else {
                LOGGER.info("ExtractionVerification: All events pass.");
            }
        }
        catch ( Throwable e ) {
            throw new OkapiException("ExtractionVerification failed.\n" + e.getMessage(), e);
        }
        finally {
            try {
                if ( tmpDoc != null ) {
                    tmpDoc.close();
                }
                closeFilterAndWriter();
            }
            finally {
                // Remove the temporary file on every path (success, early
                // return or failure), and only after its readers are closed.
                deleteTempFile(outFile);
            }
        }

        return event; // Return the original document
    }

    /**
     * Deletes the temporary merged file, if one was created.
     * If the file cannot be deleted now, it is scheduled for deletion when
     * the virtual machine exits.
     * @param file the file to delete (can be null).
     */
    private void deleteTempFile (File file) {
        if ( file == null ) {
            return;
        }
        if ( !file.delete() && file.exists() ) {
            LOGGER.warn("ExtractionVerification: Could not delete temporary file {}", file.getAbsolutePath());
            file.deleteOnExit();
        }
    }

    /**
     * Closes the writer and both filters if they are set, and resets them to null.
     */
    private void closeFilterAndWriter () {
        if ( writer != null ) {
            writer.close();
            writer = null;
        }
        if ( filter1 != null ) {
            filter1.close();
            filter1 = null;
        }
        if ( filter2 != null ) {
            filter2.close();
            filter2 = null;
        }
    }

    /**
     * Releases the writer and the filters held by this step.
     */
    public void destroy () {
        closeFilterAndWriter();
    }

    /**
     * Forwards the cancellation request to the filters currently in use.
     */
    public void cancel () {
        if ( filter1 != null ) {
            filter1.cancel();
        }
        if ( filter2 != null ) {
            filter2.cancel();
        }
    }

    /**
     * Compares two events produced at the same position by the two extractions.
     * Text units are always compared. The other resources are compared only
     * when the "all events" option is set in the parameters; otherwise two
     * non-text-unit events of the same type are considered identical.
     * @param event1 the event from the first run (can be null).
     * @param event2 the event from the second run (can be null).
     * @return true if the events are considered identical, false otherwise.
     */
    private boolean identicalEvent (Event event1,
        Event event2)
    {
        if (( event1 == null ) && ( event2 == null )) {
            return true; // They are the same
        }
        if ( event1 == null ) {
            LOGGER.warn("Event from first run is null");
            return false;
        }
        if ( event2 == null ) {
            LOGGER.warn("Event from second run is null");
            return false;
        }

        EventType type = event1.getEventType();
        if ( type != event2.getEventType() ) {
            LOGGER.warn("Event Types are different");
            return false;
        }

        if ( type == EventType.TEXT_UNIT ) {
            return verificationUtil.compareTextUnits(event1.getTextUnit(), event2.getTextUnit());
        }
        if ( !params.getAllEvents() ) {
            return true; // Only text units are verified
        }

        switch ( type ) {
        case START_DOCUMENT:
            // Nothing is compared here: the start document only provides the
            // multilingual flag passed on to the verification utility.
            StartDocument sd = event1.getStartDocument();
            verificationUtil.setMultilingual(sd.isMultilingual());
            return true;
        case START_SUBDOCUMENT:
            return verificationUtil.compareStartSubDocument(
                (StartSubDocument)event1.getResource(), (StartSubDocument)event2.getResource());
        case START_GROUP:
            return verificationUtil.compareBaseReferenceable(event1.getStartGroup(), event2.getStartGroup());
        case START_SUBFILTER:
            return verificationUtil.compareBaseReferenceable(event1.getStartSubfilter(), event2.getStartSubfilter());
        case END_DOCUMENT:
        case END_SUBDOCUMENT:
        case END_GROUP:
        case END_SUBFILTER:
            return verificationUtil.compareIResources(event1.getEnding(), event2.getEnding());
        case DOCUMENT_PART:
            return verificationUtil.compareBaseReferenceable(event1.getDocumentPart(), event2.getDocumentPart());
        default:
            // All other event types carry nothing that is compared
            return true;
        }
    }
}