All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.format.cdx.CDXFile Maven / Gradle / Ivy

There is a newer version: 1.1.9
Show newest version
package org.archive.format.cdx;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.archive.format.gzip.zipnum.ZipNumIndex;
import org.archive.format.gzip.zipnum.ZipNumParams;
import org.archive.streamcontext.Stream;
import org.archive.streamcontext.StreamWrappedInputStream;
import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.binsearch.SeekableLineReaderFactory;
import org.archive.util.binsearch.SortedTextFile;
import org.archive.util.binsearch.impl.RandomAccessFileSeekableLineReaderFactory;
import org.archive.util.iterator.CloseableIterator;
import org.archive.util.zip.OpenJDK7GZIPInputStream;

/**
 * A {@link SortedTextFile} of CDX lines that can be queried through the
 * {@link CDXInputSource} interface.
 *
 * <p>Also offers static helpers for opening a CDX resource by URI, either as a
 * seekable line-reader factory, as a gunzipped temporary file, or as a plain
 * streaming {@link BufferedReader}.
 */
public class CDXFile extends SortedTextFile implements CDXInputSource {

	/**
	 * Opens the CDX resource at {@code uri} directly (no decoding to a temp file).
	 *
	 * @param uri location of the CDX resource, handed to {@link #getUriFactory(String, boolean)}
	 * @throws IOException if the seekable factory for {@code uri} cannot be created
	 */
	public CDXFile(String uri) throws IOException {
		super(getUriFactory(uri, false));
	}

	/**
	 * Creates a CDX file on top of an already constructed line-reader factory.
	 *
	 * @param factory factory passed unchanged to the {@link SortedTextFile} constructor
	 */
	public CDXFile(SeekableLineReaderFactory factory) {
		super(factory);
	}

	/**
	 * Returns the records located via {@code getRecordIterator(key)}, wrapped with
	 * a range filter.
	 *
	 * <p>When {@code start} equals {@code key} only an end filter is applied
	 * ({@code ZipNumIndex.wrapEndIterator}); otherwise both a start and an end
	 * filter are applied ({@code ZipNumIndex.wrapStartEndIterator}).
	 *
	 * @param key    key used to position the underlying record iterator
	 * @param start  lower bound handed to the range wrapper; must not be {@code null}
	 *               (it is dereferenced for the comparison with {@code key})
	 * @param end    upper bound handed to the range wrapper
	 * @param params not used by this implementation
	 * @return the wrapped iterator; the caller is responsible for closing it
	 * @throws IOException if the underlying record iterator cannot be opened
	 */
	public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException {
		// NOTE(review): the trailing 'false' flag is defined by ZipNumIndex; its
		// meaning (presumably bound inclusiveness) is not visible here -- confirm.
		if (start.equals(key)) {
			// The iterator is already positioned at the start bound, so only the end needs enforcing.
			return ZipNumIndex.wrapEndIterator(getRecordIterator(key), end, false);
		} else {
			return ZipNumIndex.wrapStartEndIterator(getRecordIterator(key), start, end, false);
		}
	}

	/**
	 * Returns the records located via {@code getRecordIterator(key)}, wrapped with
	 * {@code ZipNumIndex.wrapPrefix}.
	 *
	 * @param key    key used to position the underlying record iterator
	 * @param prefix prefix handed to the wrapper
	 * @param exact  flag handed to the wrapper unchanged
	 * @param params not used by this implementation
	 * @return the wrapped iterator; the caller is responsible for closing it
	 * @throws IOException if the underlying record iterator cannot be opened
	 */
	public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException {
		return ZipNumIndex.wrapPrefix(getRecordIterator(key), prefix, exact);
	}

	/**
	 * Builds a seekable line-reader factory for {@code uri}.
	 *
	 * @param uri          location of the CDX resource
	 * @param decodeToTemp when {@code true}, the resource is first gunzipped to a
	 *                     temporary file (see {@link #decodeGZToTemp(String)}) and that
	 *                     file is opened for random access; when {@code false}, the
	 *                     factory is created by {@code GeneralURIStreamFactory}
	 * @return a factory for seekable line readers over the resource
	 * @throws IOException if the resource cannot be opened or decoded
	 */
	public static SeekableLineReaderFactory getUriFactory(String uri, boolean decodeToTemp) throws IOException
	{
		if (decodeToTemp) {
			return new RandomAccessFileSeekableLineReaderFactory(decodeGZToTemp(uri));
		}

		return GeneralURIStreamFactory.createSeekableStreamFactory(uri, false);
	}

	/**
	 * Decodes a gzipped CDX resource into a newly created temporary file.
	 *
	 * <p>The temporary file is created with {@code uriGZ} as its prefix and
	 * {@code ".cdx"} as its suffix. It is not removed automatically on success; the
	 * caller owns it. If decoding fails, the output stream is closed and the
	 * partially written file is deleted before the exception propagates.
	 *
	 * @param uriGZ location of the gzipped CDX resource
	 * @return the temporary file holding the uncompressed content
	 * @throws IOException if the resource cannot be read, is not valid gzip data,
	 *                     or the temporary file cannot be created or written
	 */
	public static File decodeGZToTemp(String uriGZ) throws IOException {
		final int BUFFER_SIZE = 8192;

		Stream stream = null;
		FileOutputStream out = null;
		File uncompressedCdx = null;
		boolean completed = false;

		try {
			stream = GeneralURIStreamFactory.createStream(uriGZ);
			InputStream input = new StreamWrappedInputStream(stream);
			input = new OpenJDK7GZIPInputStream(input);

			uncompressedCdx = File.createTempFile(uriGZ, ".cdx");
			out = new FileOutputStream(uncompressedCdx, false);

			byte buff[] = new byte[BUFFER_SIZE];
			int numRead = 0;

			while ((numRead = input.read(buff)) > 0) {
				out.write(buff, 0, numRead);
			}

			out.flush();
			// Close on the success path so that a failing close is reported to the caller.
			out.close();
			out = null;

			completed = true;
			return uncompressedCdx;

		} finally {
			// 'out' is still set only when the copy did not finish cleanly.
			if (out != null) {
				try {
					out.close();
				} catch (IOException ignored) {
					// A failure is already propagating; do not replace it with a close error.
				}
			}
			// Do not leave a truncated temp file behind after a failure.
			if (!completed && uncompressedCdx != null) {
				uncompressedCdx.delete();
			}
			if (stream != null) {
				stream.close();
			}
		}
	}

	/**
	 * Opens {@code uri} as a sequential, line-oriented reader.
	 *
	 * <p>The wrapping input stream is configured with {@code setCloseOnClose(true)},
	 * so closing the returned reader is expected to release the underlying stream.
	 * If building the reader chain fails, the underlying stream is closed here
	 * before the exception propagates.
	 *
	 * @param uri     location of the resource
	 * @param gzipped whether the content must be gunzipped while reading
	 * @return a buffered reader over the (optionally decompressed) content; the
	 *         caller must close it
	 * @throws IOException if the resource cannot be opened or the gzip wrapper
	 *                     cannot be created
	 */
	public static BufferedReader createStreamingLineReader(String uri, boolean gzipped) throws IOException
	{
		Stream stream = GeneralURIStreamFactory.createStream(uri);
		boolean handedOff = false;

		try {
			StreamWrappedInputStream swis = new StreamWrappedInputStream(stream);
			swis.setCloseOnClose(true);

			InputStream input = swis;

			if (gzipped) {
				input = new OpenJDK7GZIPInputStream(swis);
			}

			// NOTE(review): decodes with the platform default charset; CDX data is
			// presumably ASCII/UTF-8 -- confirm before pinning an explicit charset.
			BufferedReader reader = new BufferedReader(new InputStreamReader(input));
			handedOff = true;
			return reader;
		} finally {
			// Until the reader is returned nobody else owns the stream, so release it on failure.
			if (!handedOff) {
				stream.close();
			}
		}
	}

	/**
	 * Not implemented yet.
	 *
	 * @return always {@code 0}
	 */
	@Override
	public long getTotalLines() {
		//TODO: Implement
		return 0;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy