All downloads are free. The search and download features use the official Maven repository.

org.jwat.tools.tasks.compress.CompressTask Maven / Gradle / Ivy

Go to download

JWAT-Tools uses the available JWAT libraries to make high level tasks available either from command-line or programmatically. Common tasks include: Test, Compress, Decompress, CDX, Arc2Warc. More specialised tasks include: Changed, ContainerMD, Delete, Extract, Interval, PathIndex, Unpack, Headers2CDX.

There is a newer version: 0.7.1
Show newest version
package org.jwat.tools.tasks.compress;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

import org.jwat.archive.FileIdent;
import org.jwat.common.Base16;
import org.jwat.tools.tasks.ProcessTask;

public class CompressTask extends ProcessTask {

	public CompressTask() {
	}

	private CompressOptions options;

	/** Valid results output stream. */
	//private SynchronizedOutput validOutput;

	/** Invalid results output stream. */
	//private SynchronizedOutput invalidOutput;

	/** Exception output stream. */
	//private SynchronizedOutput exceptionsOutput;

	public void runtask(CompressOptions options) {
		this.options = options;

		ResultThread resultThread = new ResultThread();
		Thread thread = new Thread(resultThread);
		thread.start();

		threadpool_feeder_lifecycle( options.filesList, options.bQueueFirst, this, options.threads );

		resultThread.bExit = true;
		while (!resultThread.bClosed) {
			try {
				Thread.sleep( 100 );
			}
			catch (InterruptedException e) {
				e.printStackTrace();
			}
		}

		calculate_runstats();

		cout.println("         Time: " + run_timestr + " (" + run_dtm + " ms.)" );
		cout.println("   TotalBytes: " + toSizeString(current_size));
		cout.println("     AvgBytes: " + toSizePerSecondString(run_avgbpsec));
		cout.println(String.format("       Gained: %s (%.2f%%).", toSizeString(uncompressed - compressed), current_gain));
		cout.println("    Completed: " + completed);
		cout.println("  Incompleted: " + incomplete);
		cout.println("IntegrityFail: " + integrityFail);

		Iterator> schemesIter = schemesMap.entrySet().iterator();
		Entry schemeEntry;
		while (schemesIter.hasNext()) {
			schemeEntry = schemesIter.next();
			cout.println(schemeEntry.getKey() + " (" + schemeEntry.getValue() + ")");
		}
	}

	@Override
	public void process(File srcFile) {
		FileIdent fileIdent = FileIdent.ident(srcFile);
		if (srcFile.length() > 0) {
			// debug
			//System.out.println(fileIdent.filenameId + " " + fileIdent.streamId + " " + srcFile.getName());
			if (fileIdent.filenameId != fileIdent.streamId) {
				cout.println("Wrong extension: '" + srcFile.getPath() + "'");
			}
			switch (fileIdent.streamId) {
			case FileIdent.FILEID_UNKNOWN:
			case FileIdent.FILEID_ARC:
			case FileIdent.FILEID_WARC:
				executor.submit(new TaskRunnable(srcFile));
				queued_size += srcFile.length();
				++queued;
				break;
			default:
				break;
			}
			if (fileIdent.streamId != FileIdent.FILEID_GZIP && fileIdent.streamId != FileIdent.FILEID_ARC_GZ && fileIdent.streamId != FileIdent.FILEID_WARC_GZ) {
			}
		} else {
			switch (fileIdent.filenameId) {
			case FileIdent.FILEID_UNKNOWN:
			case FileIdent.FILEID_ARC:
			case FileIdent.FILEID_WARC:
				cout.println("Empty file: '" + srcFile.getPath() + "'");
				break;
			default:
				break;
			}
		}
	}

	class TaskRunnable implements Runnable {
		File srcFile;
		TaskRunnable(File srcFile) {
			this.srcFile = srcFile;
		}
		@Override
		public void run() {
			CompressFile compressFile = new CompressFile();
			CompressResult compressionResult = compressFile.compressFile(srcFile, options);
			results.add(compressionResult);
			resultsReady.release();
		}
	}

	/** Results ready resource semaphore. */
	private Semaphore resultsReady = new Semaphore(0);

	/** Completed Compressed results list. */
	private ConcurrentLinkedQueue results = new ConcurrentLinkedQueue();

	protected Map schemesMap = new HashMap();

	private long completed = 0;

	private long incomplete = 0;

	private long integrityFail = 0;

	private long uncompressed = 0;

	private long compressed = 0;

	private double current_gain = 0.0;

	class ResultThread implements Runnable {

		boolean bExit = false;

		boolean bClosed = false;

		@Override
		public void run() {
			StringBuilder sb = new StringBuilder();
			CompressResult result;
			Iterator> schemesIter;
			Entry schemeEntry;
			String scheme;
			Long count;
			boolean bLoop = true;
			PrintWriter lstWriter = null;
			try {
				if (options.lstFile != null) {
					lstWriter = new PrintWriter(new BufferedWriter(new FileWriter(options.lstFile)));
				}
				while (bLoop) {
					try {
						if (resultsReady.tryAcquire(1, TimeUnit.SECONDS)) {
							result = results.poll();
							current_size += result.srcFile.length();
							++processed;

							if (result.schemesMap != null) {
								schemesIter = result.schemesMap.entrySet().iterator();
								while (schemesIter.hasNext()) {
									schemeEntry = schemesIter.next();
									scheme = schemeEntry.getKey();
									count = schemesMap.get(scheme);
									if (count == null) {
										count = 0L;
									}
									schemesMap.put(scheme, count + schemeEntry.getValue());
								}
							}

							if (result.bCompleted) {
								++completed;
								if (options.bVerify) {
						        	if (result.bVerified) {
						        		if (lstWriter != null) {
							        		sb.setLength(0);
								        	sb.append(result.srcFile.getName());
								        	sb.append(",");
								        	sb.append(result.srcFile.length());
								        	sb.append(",");
								        	sb.append(Base16.encodeArray(result.md5DigestBytesOrg));
								        	sb.append(",");
								        	sb.append(result.dstFile.getName());
								        	sb.append(",");
								        	sb.append(result.dstFile.length());
								        	sb.append(",");
								        	sb.append(Base16.encodeArray(result.md5compDigestBytesVerify));
								        	//cout.println(sb.toString());
								        	lstWriter.println(sb.toString());
						        		}
							        }
						        	else {
						        		++integrityFail;
										cout.print("Integrity fail: " + result.srcFile.getPath());
						        	}
								}
								if (!options.bVerify || result.bVerified) {
									uncompressed += result.srcFile.length();
						        	compressed += result.dstFile.length();
						        	if (uncompressed > 0) {
										current_gain = (double)(uncompressed - compressed) / (double)uncompressed * 100.0;
						        	}
								}
							}
							else {
								++incomplete;
								cout.print("Incomplete: " + result.srcFile.getPath());
							}

							result.dstFile.setLastModified(result.srcFile.lastModified());

							if (options.bDryrun) {
								result.dstFile.delete();
							}
							else if (options.bRemove) {
								if (result.bCompleted && result.bVerified) {
									result.srcFile.delete();
								}
							}

							calculate_progress();

					        //cout.print_progress("Queued: " + queued + " - Processed: " + processed + " - Estimated: " + new Date(ctm + etm).toString() + ".");
							cout.print_progress(String.format("Queued: %d - Processed: %d - %s - Estimated: %s (%.2f%%) - Saved: %s (%.2f%%).", queued, processed, toSizePerSecondString(current_avgbpsec), current_timestr, current_progress, toSizeString(uncompressed - compressed), current_gain));
						}
						else if (bExit && processed == queued) {
							bLoop = false;
						}
					}
					catch (InterruptedException e) {
						bLoop = false;
					}
				}
			}
			catch (Throwable t) {
				t.printStackTrace();
				System.err.println("Fatality!");
			}
			if (lstWriter != null) {
				lstWriter.close();
			}
			bClosed = true;
		}

	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy