org.jwat.tools.tasks.test.TestTask Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jwat-tools Show documentation
Show all versions of jwat-tools Show documentation
JWAT-Tools uses the available JWAT libraries to make high-level tasks available either from the command line or programmatically.
Common tasks include: Test, Compress, Decompress, CDX, Arc2Warc.
More specialised tasks include: Changed, ContainerMD, Delete, Extract, Interval, PathIndex, Unpack, Headers2CDX.
package org.jwat.tools.tasks.test;
import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import org.jwat.archive.Cloner;
import org.jwat.archive.FileIdent;
import org.jwat.common.Diagnosis;
import org.jwat.common.DiagnosisType;
import org.jwat.common.UriProfile;
import org.jwat.tools.JWATTools;
import org.jwat.tools.core.CommandLine;
import org.jwat.tools.core.SynchronizedOutput;
import org.jwat.tools.core.ValidatorPlugin;
import org.jwat.tools.tasks.ProcessTask;
import org.jwat.tools.validators.XmlValidatorPlugin;
public class TestTask extends ProcessTask {
/** Name of this command as used on the command line. */
public static final String commandName = "test";
/** One-line description of this command. */
public static final String commandDescription = "test validity of ARC/WARC/GZip file(s)";
/**
 * Creates a test task with default settings; the actual configuration
 * happens in {@code command(...)} from the parsed arguments.
 */
public TestTask() {
}
/**
 * Prints the usage text of the {@code test} command to standard output.
 *
 * The {@code -a} option takes a timestamp that {@code command(...)} parses
 * with the pattern {@code yyyyMMddHHmmss}; the help text names that format
 * so the user knows what to supply.
 *
 * NOTE(review): the value placeholders follow the existing
 * {@code [-w THREADS]} notation - confirm against the command line parser
 * whether values are given separated or attached to the option.
 */
@Override
public void show_help() {
    System.out.println("jwattools test [-beilx] [-w THREADS] [-a TIMESTAMP] ...");
    System.out.println("");
    System.out.println("test one or more ARC/WARC/GZip files");
    System.out.println("");
    System.out.println("options:");
    System.out.println("");
    System.out.println(" -a TIMESTAMP only test files with last-modified after TIMESTAMP (yyyyMMddHHmmss)");
    System.out.println(" -b tag/rename files with errors/warnings (*.bad)");
    System.out.println(" -e show errors");
    System.out.println(" -i --ignore-digest skip digest calculation and validation");
    System.out.println(" -l relaxed URL URI validation");
    System.out.println(" -x to validate text/xml payload (eg. mets)");
    System.out.println(" -w THREADS set the amount of worker thread(s) (defaults to 1)");
}
/*
 * Summary.
 *
 * Job-wide totals, accumulated from each per-file result in update_summary().
 */
/** Total of the per-file {@code arcGzFiles} counters. */
private int arcGzFiles = 0;
/** Total of the per-file {@code warcGzFiles} counters. */
private int warcGzFiles = 0;
/** Total of the per-file {@code gzFiles} counters. */
private int gzFiles = 0;
/** Total of the per-file {@code arcFiles} counters. */
private int arcFiles = 0;
/** Total of the per-file {@code warcFiles} counters. */
private int warcFiles = 0;
/** Sum of the GZip, ARC and WARC error counts of all results. */
private int errors = 0;
/** Sum of the GZip, ARC and WARC warning counts of all results. */
private int warnings = 0;
/** Total of the per-file {@code runtimeErrors} counters. */
private int runtimeErrors = 0;
/** Total of the per-file {@code skipped} counters. */
private int skipped = 0;
/*
 * Settings.
 */
/** Passed to each file test and to the result printer; enabled by the show-errors argument. */
private boolean bShowErrors = false;
/** Cleared when the ignore-digest argument is present. */
private boolean bValidateDigest = true;
/**
 * Last-modified cut-off in milliseconds since the epoch; only files modified
 * strictly after this instant are processed. Kept as a primitive because it is
 * only ever compared against {@code File.lastModified()}.
 */
private long after = 0L;
/** When set, files whose result contains diagnoses or throwables are renamed to {@code *.bad}. */
private boolean bBad = false;
/** Validator plugins handed to every file test. NOTE(review): raw type here; element type presumably ValidatorPlugin - confirm. */
private List validatorPlugins = new LinkedList();
/** URI profile handed to every file test; switched to the lax profile by the relaxed-validation argument. */
private UriProfile uriProfile = UriProfile.RFC3986;
/** Record header size limit handed to every file test (1 MiB). */
private int recordHeaderMaxSize = 1024 * 1024;
/** Payload header size limit handed to every file test (1 MiB). */
private int payloadHeaderMaxSize = 1024 * 1024;
/** Optional cloner passed to the file test; stays {@code null} in the visible code because its initialisation is commented out. */
private Cloner cloner;
/*
 * State.
 */
/** Valid results output stream. */
private SynchronizedOutput validOutput;
/** Invalid results output stream. */
private SynchronizedOutput invalidOutput;
/** Exception output stream. */
private SynchronizedOutput exceptionsOutput;
/**
 * Runs the test command: reads the settings from the parsed arguments, opens
 * the three result files ("v.out", "i.out", "e.out"), starts the result
 * thread, feeds the requested files to the worker pool, waits for the result
 * thread to finish and finally writes the job summary to the valid output,
 * the invalid output and the console.
 *
 * NOTE(review): the tail of this method and the declaration line of
 * EntryComparator (from the "typeNumbersList" line onwards) appear to have
 * lost all text between '<' and '>' characters - generic type arguments,
 * loop conditions/bodies and the closing brace of this method. Restore these
 * lines from the upstream source before compiling.
 *
 * @param arguments parsed command line arguments
 */
@Override
public void command(CommandLine.Arguments arguments) {
CommandLine.Argument argument;
// Thread workers.
// NOTE(review): "threads" is not declared in this part of the file; presumably inherited from ProcessTask.
argument = arguments.idMap.get( JWATTools.A_WORKERS );
if ( argument != null && argument.value != null ) {
try {
threads = Integer.parseInt(argument.value);
} catch (NumberFormatException e) {
System.out.println( "Invalid number of threads requested: " + argument.value );
System.exit( 1 );
}
}
if ( threads < 1 ) {
System.out.println( "Invalid number of threads requested: " + threads );
System.exit( 1 );
}
// Show errors.
if ( arguments.idMap.containsKey( JWATTools.A_SHOW_ERRORS ) ) {
bShowErrors = true;
}
System.out.println("Showing errors: " + bShowErrors);
// Ignore digest.
if ( arguments.idMap.containsKey( JWATTools.A_IGNORE_DIGEST ) ) {
bValidateDigest = false;
}
System.out.println("Validate digest: " + bValidateDigest);
// Relaxed URI validation.
if ( arguments.idMap.containsKey( JWATTools.A_LAX ) ) {
uriProfile = UriProfile.RFC3986_ABS_16BIT_LAX;
System.out.println("Using relaxed URI validation for ARC URL and WARC Target-URI.");
}
// XML validation.
if ( arguments.idMap.containsKey( JWATTools.A_XML ) ) {
validatorPlugins.add(new XmlValidatorPlugin());
}
// Tag.
if ( arguments.idMap.containsKey( JWATTools.A_BAD ) ) {
bBad = true;
System.out.println("Tagging enabled for invalid files");
}
// After.
// The value is parsed strictly (non-lenient) as yyyyMMddHHmmss and stored as epoch milliseconds.
argument = arguments.idMap.get( JWATTools.A_AFTER );
if ( argument != null && argument.value != null ) {
try {
DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
dateFormat.setLenient(false);
Date afterDate = dateFormat.parse(argument.value);
after = afterDate.getTime();
} catch (ParseException e) {
// Unlike an invalid thread count, an invalid date only prints a message;
// the run continues with the previous value of "after".
System.out.println("Invalid date format - " + argument.value);
}
}
// TODO optional
//cloner = Cloner.getCloner();
// Files.
// NOTE(review): no null check here - this throws a NullPointerException if the
// files argument is absent, unless the command line parser guarantees its presence.
argument = arguments.idMap.get( JWATTools.A_FILES );
List filesList = argument.values;
validOutput = new SynchronizedOutput("v.out");
invalidOutput = new SynchronizedOutput("i.out");
exceptionsOutput = new SynchronizedOutput("e.out");
// The result thread consumes finished results while the files are being fed to the workers.
ResultThread resultThread = new ResultThread();
Thread thread = new Thread(resultThread);
thread.start();
threadpool_feeder_lifecycle(filesList, this);
// Ask the result thread to stop once everything queued has been processed,
// then poll every 100 ms until it reports that it has closed.
resultThread.bExit = true;
while (!resultThread.bClosed) {
try {
Thread.sleep( 100 );
} catch (InterruptedException e) {
e.printStackTrace();
}
}
calucate_runstats();
if (cloner != null) {
try {
cloner.close();
} catch (IOException e) {
e.printStackTrace();
}
cloner = null;
}
exceptionsOutput.close();
// The same job summary is written three times: valid output, invalid output and console.
validOutput.acquire();
validOutput.out.println( "#" );
validOutput.out.println( "# Job summary" );
validOutput.out.println( "#" );
validOutput.out.println( "GZip files: " + gzFiles );
validOutput.out.println( " + Arc: " + arcGzFiles );
validOutput.out.println( " + Warc: " + warcGzFiles );
validOutput.out.println( " Arc files: " + arcFiles );
validOutput.out.println( "Warc files: " + warcFiles );
validOutput.out.println( " Errors: " + errors );
validOutput.out.println( " Warnings: " + warnings );
validOutput.out.println( "RuntimeErr: " + runtimeErrors );
validOutput.out.println( " Skipped: " + skipped );
validOutput.out.println( " Time: " + run_timestr + " (" + run_dtm + " ms.)" );
validOutput.out.println( "TotalBytes: " + toSizeString(current_size));
validOutput.out.println( " AvgBytes: " + toSizePerSecondString(run_avgbpsec));
validOutput.release();
validOutput.close();
invalidOutput.acquire();
invalidOutput.out.println( "#" );
invalidOutput.out.println( "# Job summary" );
invalidOutput.out.println( "#" );
invalidOutput.out.println( "GZip files: " + gzFiles );
invalidOutput.out.println( " + Arc: " + arcGzFiles );
invalidOutput.out.println( " + Warc: " + warcGzFiles );
invalidOutput.out.println( " Arc files: " + arcFiles );
invalidOutput.out.println( "Warc files: " + warcFiles );
invalidOutput.out.println( " Errors: " + errors );
invalidOutput.out.println( " Warnings: " + warnings );
invalidOutput.out.println( "RuntimeErr: " + runtimeErrors );
invalidOutput.out.println( " Skipped: " + skipped );
invalidOutput.out.println( " Time: " + run_timestr + " (" + run_dtm + " ms.)" );
invalidOutput.out.println( "TotalBytes: " + toSizeString(current_size));
invalidOutput.out.println( " AvgBytes: " + toSizePerSecondString(run_avgbpsec));
invalidOutput.release();
invalidOutput.close();
cout.println( "#" );
cout.println( "# Job summary" );
cout.println( "#" );
cout.println( "GZip files: " + gzFiles );
cout.println( " + Arc: " + arcGzFiles );
cout.println( " + Warc: " + warcGzFiles );
cout.println( " Arc files: " + arcFiles );
cout.println( "Warc files: " + warcFiles );
cout.println( " Errors: " + errors );
cout.println( " Warnings: " + warnings );
cout.println( "RuntimeErr: " + runtimeErrors );
cout.println( " Skipped: " + skipped );
cout.println( " Time: " + run_timestr + " (" + run_dtm + " ms.)" );
cout.println( "TotalBytes: " + toSizeString(current_size));
cout.println( " AvgBytes: " + toSizePerSecondString(run_avgbpsec));
// NOTE(review): from here to the EntryComparator declaration the text is damaged
// (content between '<' and '>' is missing). What remains shows the entries of
// typeNumbers and entityNumbers being copied into lists and sorted with
// EntryComparator; the loops that followed are not recoverable from this text.
List> typeNumbersList = new ArrayList>(typeNumbers.entrySet());
//Collections.sort(typeNumbersList, new EntryDiagnosisTypeComparator());
Collections.sort(typeNumbersList, new EntryComparator());
Entry typeNumberEntry;
for (int i=0; i> entityNumbersList = new ArrayList>(entityNumbers.entrySet());
//Collections.sort(entityNumbersList, new EntryStringComparator());
Collections.sort(entityNumbersList, new EntryComparator());
Entry entityNumberEntry;
// NOTE(review): the next line fuses the start of a loop in command() with the tail
// of the EntryComparator class declaration; the comparator below orders map
// entries by comparing their keys.
for (int i=0; i> implements Comparator> {
@Override
public int compare(Map.Entry o1, Map.Entry o2) {
return o1.getKey().compareTo(o2.getKey());
}
}
/*
public class EntryDiagnosisTypeComparator implements Comparator {
@Override
public int compare(Object o1, Object o2) {
Entry e1 = (Entry)o1;
Entry e2 = (Entry)o2;
return e1.getKey().compareTo(e2.getKey());
}
}
public class EntryStringComparator implements Comparator {
@Override
public int compare(Object o1, Object o2) {
Entry e1 = (Entry)o1;
Entry e2 = (Entry)o2;
return e1.getKey().compareTo(e2.getKey());
}
}
*/
/**
 * Examines one candidate file and queues it for validation when it qualifies.
 *
 * Files whose last-modified time is not after the configured cut-off are
 * ignored. Empty files are only reported (when their name identifies them as
 * GZip/ARC/WARC). Non-empty files are reported if the type derived from the
 * name differs from the type derived from the content, and are submitted to
 * the executor when the content is identified as GZip, ARC or WARC.
 *
 * @param srcFile file to examine
 */
@Override
public synchronized void process(File srcFile) {
    // Cut-off filter: nothing at all happens for files that are not new enough.
    if (srcFile.lastModified() <= after) {
        return;
    }
    FileIdent fileIdent = FileIdent.ident(srcFile);

    // Empty files cannot be validated; mention them only if the name looks like an archive.
    if (srcFile.length() <= 0) {
        switch (fileIdent.filenameId) {
        case FileIdent.FILEID_GZIP:
        case FileIdent.FILEID_ARC:
        case FileIdent.FILEID_ARC_GZ:
        case FileIdent.FILEID_WARC:
        case FileIdent.FILEID_WARC_GZ:
            cout.println("Empty file: '" + srcFile.getPath() + "'");
            break;
        default:
            break;
        }
        return;
    }

    // Name-based and content-based identification disagree.
    if (fileIdent.streamId != fileIdent.filenameId) {
        cout.println("Wrong extension: '" + srcFile.getPath() + "'");
    }

    // Only content recognised as one of the supported formats is queued.
    switch (fileIdent.streamId) {
    case FileIdent.FILEID_GZIP:
    case FileIdent.FILEID_ARC:
    case FileIdent.FILEID_ARC_GZ:
    case FileIdent.FILEID_WARC:
    case FileIdent.FILEID_WARC_GZ:
        executor.submit(new TaskRunnable(srcFile));
        queued_size += srcFile.length();
        ++queued;
        break;
    default:
        break;
    }
}
/**
 * Work item submitted to the executor: validates one file with a freshly
 * configured {@code TestFile2} and hands the result over to the result thread.
 */
class TaskRunnable implements Runnable {

    /** File validated by this work item. */
    File srcFile;

    /**
     * @param srcFile file to validate
     */
    TaskRunnable(File srcFile) {
        this.srcFile = srcFile;
    }

    /**
     * Validates the file, then publishes the result on the results queue and
     * signals its availability through the semaphore.
     *
     * NOTE(review): if {@code processFile} throws, no result is published and
     * no permit is released - check how the result thread's termination
     * condition behaves in that case.
     */
    @Override
    public void run() {
        // Every work item uses its own tester, configured from the enclosing task's settings.
        TestFile2 tester = new TestFile2();
        tester.callback = null;
        tester.validatorPlugins = validatorPlugins;
        tester.uriProfile = uriProfile;
        tester.payloadHeaderMaxSize = payloadHeaderMaxSize;
        tester.recordHeaderMaxSize = recordHeaderMaxSize;
        tester.bValidateDigest = bValidateDigest;
        tester.bShowErrors = bShowErrors;

        TestFileResult outcome = tester.processFile(srcFile, cloner);
        outcome.srcFile = srcFile;

        // Enqueue first, then release the permit the result thread is waiting for.
        results.add(outcome);
        resultsReady.release();
    }

}
/*
class TestCallable implements Callable {
File srcFile;
TestCallable(File srcFile) {
this.srcFile = srcFile;
}
@Override
public TestFileResult call() throws Exception {
TestFileResult result = testFile.processFile(srcFile, bShowErrors, null);
results.add(result);
resultsReady.release();
return result;
}
}
*/
/** Results ready resource semaphore; one permit is released for every result added to {@link #results}. */
private Semaphore resultsReady = new Semaphore(0);
/**
 * Completed validation results list. Worker tasks add to it and the result
 * thread polls from it; the element type is declared so that polled items
 * can be used as {@code TestFileResult} without a cast.
 */
private ConcurrentLinkedQueue<TestFileResult> results = new ConcurrentLinkedQueue<TestFileResult>();
/**
 * Consumer side of the validation pipeline. Waits for completed results,
 * prints each one to the valid/invalid/exception outputs, optionally renames
 * files with problems to {@code *.bad}, and updates the summary and progress
 * counters. The loop ends once {@link #bExit} has been set and the number of
 * processed results equals the number of queued files, or when the thread is
 * interrupted.
 */
class ResultThread implements Runnable {

    /**
     * Set by the thread running {@code command(...)} once all files have been
     * fed. Volatile because it is written and read by different threads.
     */
    volatile boolean bExit = false;

    /**
     * Set by this thread after its loop has ended; polled by
     * {@code command(...)}. Volatile for the same cross-thread reason.
     */
    volatile boolean bClosed = false;

    @Override
    public void run() {
        TestFileResult result;
        File newFile;
        long srcLength;
        cout.println("Output Thread started.");
        boolean bLoop = true;
        while (bLoop) {
            try {
                // Wait at most one second so the exit condition is re-checked regularly.
                if (resultsReady.tryAcquire(1, TimeUnit.SECONDS)) {
                    result = results.poll();
                    // Read the size before the file may be renamed below: after a
                    // successful rename the old path no longer exists and
                    // File.length() would report 0, under-counting current_size.
                    srcLength = result.srcFile.length();
                    // All three outputs are held while one result is written.
                    validOutput.acquire();
                    invalidOutput.acquire();
                    exceptionsOutput.acquire();
                    try {
                        result.printResult(bShowErrors, validOutput.out, invalidOutput.out, exceptionsOutput.out);
                        if (bBad) {
                            if (result.rdList.size() > 0 || result.throwableList.size() > 0) {
                                // Tag the file unless it already carries the ".bad" suffix.
                                if (!result.srcFile.getName().endsWith(".bad")) {
                                    newFile = new File(result.srcFile.getParent(), result.srcFile.getName() + ".bad");
                                    if (!result.srcFile.renameTo(newFile)) {
                                        cout.println(String.format("Could not renamed '%s' to '%s'", result.srcFile.getPath(), newFile.getPath()));
                                    }
                                }
                            }
                        }
                    }
                    catch (Throwable t) {
                        // A failure while reporting is counted against this result and
                        // must not stop the thread or leave the outputs locked.
                        ++result.runtimeErrors;
                        t.printStackTrace();
                    }
                    exceptionsOutput.release();
                    invalidOutput.release();
                    validOutput.release();
                    update_summary(result);
                    current_size += srcLength;
                    ++processed;
                    calculate_progress();
                    cout.print_progress(String.format("Queued: %d - Processed: %d - %s - Estimated: %s (%.2f%%).", queued, processed, toSizePerSecondString(current_avgbpsec), current_timestr, current_progress));
                } else if (bExit && processed == queued) {
                    // Nothing arrived within the timeout, feeding is done and every
                    // queued file has been accounted for.
                    bLoop = false;
                }
            } catch (InterruptedException e) {
                // Stop the loop and keep the interrupt status visible to the caller.
                Thread.currentThread().interrupt();
                bLoop = false;
            }
        }
        cout.println("Output Thread stopped.");
        bClosed = true;
    }

}
// Per-job counters whose entries are sorted by key when the job summary is produced.
// NOTE(review): declared as raw types here; the generic parameters seem to have been
// lost. Keys are presumably DiagnosisType with Integer counts - confirm against upstream.
Map typeNumbers = new TreeMap();
// NOTE(review): raw type as well; keys are presumably entity names (String) with
// Integer counts - confirm against upstream.
Map entityNumbers = new HashMap();
public void update_summary(TestFileResult result) {
arcGzFiles += result.arcGzFiles;
warcGzFiles += result.warcGzFiles;
gzFiles += result.gzFiles;
arcFiles += result.arcFiles;
warcFiles += result.warcFiles;
runtimeErrors += result.runtimeErrors;
skipped += result.skipped;
errors += result.gzipErrors;
warnings += result.gzipWarnings;
errors += result.arcErrors;
warnings += result.arcWarnings;
errors += result.warcErrors;
warnings += result.warcWarnings;
List resultDiagnoses = result.rdList;
TestFileResultItemDiagnosis resultDiagnosis;
List diagnoses;
Diagnosis diagnosis;
Integer number;
if (resultDiagnoses != null) {
for (int i=0; i