All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.format.gzip.zipnum.ZipNumIndex Maven / Gradle / Ivy

There is a newer version: 1.1.9
Show newest version
package org.archive.format.gzip.zipnum;

import java.io.IOException;
import java.util.logging.Logger;

import org.archive.format.cdx.CDXInputSource;
import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.binsearch.FieldExtractingSLR;
import org.archive.util.binsearch.SeekableLineReader;
import org.archive.util.binsearch.SortedTextFile;
import org.archive.util.iterator.BoundedStringIterator;
import org.archive.util.iterator.CloseableIterator;
import org.archive.util.iterator.StartBoundedStringIterator;

public class ZipNumIndex implements CDXInputSource {
	// Class-wide logger, named after this class.
	final static Logger LOGGER = Logger.getLogger(ZipNumIndex.class.getName());

	// Prefix prepended to a part id to build the path of its .gz file (see getReaderPath());
	// when left null it is derived lazily from the directory portion of summaryFile.
	protected String pathRoot;
		
	// Location of the summary file; opened by init() when non-null.
	protected String summaryFile;
	// Passed to SortedTextFile.setBinsearchBlockSize() in init().
	protected int binsearchBlockSize = 8192;
	// Passed to GeneralURIStreamFactory.createSeekableStreamFactory() in init().
	protected int readaheadSize = 512;
	// Sorted summary file; stays null until init() runs with a non-null summaryFile.
	protected SortedTextFile summary;
	
	// Forwarded to ZipNumBlockLoader.attemptLoadBlock() by doBlockLoad().
	protected boolean required = true;
	
	// Block loader; init() creates a default instance if none has been set.
	protected ZipNumBlockLoader blockLoader;
	
	// Used only for reference / user info
	protected int cdxLinesPerBlock = 3000;
	
	
	// Value reported by getTotalLines(); nothing in this class assigns it after initialization.
	protected long cdxLinesTotalCount = 0;
	
	//protected HashMap locMap = null;
		
	// Initial value of useNio.
	protected final static boolean DEFAULT_USE_NIO = true;

	// Zero-based index of the tab-separated summary field parsed as the line count.
	// NOTE(review): callers subtract these values between lines, so it looks cumulative — confirm.
	private static final int LINE_COUNT_FIELD = 4;
	
	// Passed to GeneralURIStreamFactory.createSeekableStreamFactory() in init().
	protected boolean useNio = DEFAULT_USE_NIO;
	
	/**
	 * Creates an unconfigured index. Configure it through the setters and then
	 * call {@link #init()} before use.
	 */
	public ZipNumIndex() {
		// Intentionally empty: all state is set up by the setters and init().
	}
					
	/**
	 * Opens the summary file (when one is configured) and makes sure a block
	 * loader is available.
	 *
	 * <p>If {@code summaryFile} is null, {@code summary} is left untouched.
	 * A block loader that was already set is kept.
	 *
	 * @throws IOException if the summary file cannot be opened
	 */
	public void init() throws IOException {
		if (summaryFile != null) {
			summary = new SortedTextFile(
					GeneralURIStreamFactory.createSeekableStreamFactory(summaryFile, readaheadSize, useNio));
			summary.setBinsearchBlockSize(binsearchBlockSize);
		}

		// Only fall back to a default loader when none was provided.
		if (blockLoader == null) {
			blockLoader = new ZipNumBlockLoader();
		}
	}
	
	/**
	 * Builds and initializes an index from the location of its summary file.
	 *
	 * @param summaryFile location of the summary file
	 * @return the initialized index
	 * @throws IOException if {@link #init()} fails
	 */
	public static ZipNumIndex createIndexWithSummaryPath(String summaryFile) throws IOException
	{
		ZipNumIndex index = new ZipNumIndex();
		index.setSummaryFile(summaryFile);
		index.init();

		return index;
	}
	
	/**
	 * Builds and initializes an index that only has a path root configured.
	 *
	 * <p>No summary file is set here, so {@link #init()} does not open a summary.
	 *
	 * @param pathRoot prefix used to resolve part files
	 * @return the initialized index
	 * @throws IOException if {@link #init()} fails
	 */
	public static ZipNumIndex createIndexWithBasePath(String pathRoot) throws IOException
	{
		ZipNumIndex index = new ZipNumIndex();
		index.setPathRoot(pathRoot);
		index.init();

		return index;
	}
				
	/**
	 * Reads the line-count field ({@code LINE_COUNT_FIELD}) of a tab-separated summary line.
	 *
	 * @param line a summary line
	 * @return the field value narrowed to {@code int}, or -1 when
	 *         {@link #extractLongField(String, int)} reports it as missing or non-numeric
	 */
	public static int extractLineCount(String line)
	{
		long count = extractLongField(line, LINE_COUNT_FIELD);
		// Narrowing cast: values beyond the int range are truncated.
		return (int) count;
	}
	
	/**
	 * Parses one tab-separated field of a line as a {@code long}.
	 *
	 * <p>This is a best-effort accessor: every failure mode yields the sentinel
	 * -1 instead of an exception.
	 *
	 * @param line  the tab-separated line; may be null
	 * @param index zero-based field index
	 * @return the parsed value, or -1 if {@code line} is null, {@code index} is
	 *         negative, the line has too few fields, or the field is not a valid long
	 */
	protected static long extractLongField (String line, int index)
	{
		if ((line == null) || (index < 0)) {
			return -1;
		}

		String[] parts = line.split("\t");

		if (parts.length <= index) {
			return -1;
		}

		try {
			return Long.parseLong(parts[index]);
		} catch (NumberFormatException ignored) {
			// Deliberately lenient: a non-numeric field is reported as "absent".
			return -1;
		}
	}
	
	/**
	 * Sums the differences between the line counts of consecutive summary lines.
	 *
	 * <p>A pair only contributes when the earlier line's count is non-negative;
	 * the later line's count is used as extracted, even if it is the -1 sentinel.
	 *
	 * @param blocks summary lines, in order
	 * @return the accumulated difference, or 0 when fewer than two lines are given
	 */
	public int getNumLines(String[] blocks)
	{
		int total = 0;

		if (blocks.length >= 2) {
			int previous = -1;

			for (int i = 0; i < blocks.length; i++) {
				int current = extractLineCount(blocks[i]);

				if (previous >= 0) {
					total += (current - previous);
				}

				previous = current;
			}
		}

		return total;
	}
	
	/**
	 * Adds up field 3 (zero-based, tab-separated) of every given summary line.
	 *
	 * <p>Lines whose field is missing or non-numeric contribute the -1 sentinel
	 * returned by {@link #extractLongField(String, int)}.
	 *
	 * @param blocks summary lines
	 * @return the sum of the extracted values
	 */
	public long getTotalLength(String[] blocks)
	{
		long total = 0;

		for (int i = 0; i < blocks.length; i++) {
			total += extractLongField(blocks[i], 3);
		}

		return total;
	}
	
	/**
	 * Computes a line-count difference for the summary range covering
	 * {@code start}..{@code end}.
	 *
	 * <p>The summary is asked for the start/end offsets of the range. For each
	 * offset that lies inside the file, the reader seeks there, discards one
	 * line and takes the following line. If no end line is found that way, the
	 * last line of the summary is used. A line that could not be read counts as 0.
	 *
	 * @param start start key of the range
	 * @param end   end key of the range
	 * @return line count of the end line minus line count of the start line
	 * @throws IOException if reading the summary fails
	 */
	public int getNumLines(String start, String end) throws IOException
	{
		String firstLine = null;
		String lastLine = null;
		SeekableLineReader reader = null;

		try {
			reader = summary.getSLR();

			long[] range = summary.getStartEndOffsets(reader, start, end);

			if (range[0] > 0) {
				reader.seek(range[0]);
				// The first read after the seek is discarded; the next line is the one used.
				reader.readLine();
				firstLine = reader.readLine();
			}

			if (range[1] < reader.getSize()) {
				reader.seek(range[1]);
				reader.readLine();
				lastLine = reader.readLine();
			}

			// Get the last line
			if (lastLine == null) {
				lastLine = summary.getLastLine(reader);
			}
		} finally {
			if (reader != null) {
				reader.close();
			}
		}

		int endCount = (lastLine == null) ? 0 : extractLineCount(lastLine);
		int startCount = (firstLine == null) ? 0 : extractLineCount(firstLine);

		return endCount - startCount;
	}
	
	/**
	 * Immutable result of {@link #getNthPage(String[], int, int, boolean)}:
	 * the page's iterator together with the total number of pages.
	 */
	public static class PageResult
	{
		/** Iterator for the requested page; getNthPage passes null when no page is produced. */
		public final CloseableIterator iter;

		/** Total number of pages computed for the range. */
		public final int numPages;

		PageResult(CloseableIterator iter, int numPages)
		{
			this.numPages = numPages;
			this.iter = iter;
		}
	}
	
	/**
	 * Returns one page of summary lines for a key range, with paging expressed
	 * in units of the summary's line-count field.
	 *
	 * @param startEnd     two-element array: start key at index 0, end key at index 1
	 * @param page         zero-based page number
	 * @param pageSize     page size, in line-count units
	 * @param numPagesOnly when true, only the page count is computed
	 * @return the page iterator and the page count; the iterator is null when
	 *         {@code numPagesOnly} is set or {@code page} is not below the page count
	 * @throws IOException if reading the summary fails
	 */
	public PageResult getNthPage(String[] startEnd, int page, int pageSize, boolean numPagesOnly) throws IOException
	{
		String[] bounds = getSummary().getRange(startEnd[0], startEnd[1]);

		int firstLineNumber = extractLineCount(bounds[0]);
		int endLineNumber = extractLineCount(bounds[1]) + 1;

		// Round up; there is always at least one page.
		int numPages = ((endLineNumber - firstLineNumber - 1) / pageSize) + 1;

		if (numPages < 1) {
			numPages = 1;
		}

		if (numPagesOnly || (page >= numPages)) {
			return new PageResult(null, numPages);
		}

		int pageStartLine = (page * pageSize) + firstLineNumber;

		// Page 0 keeps the range's own first summary line.
		if (page > 0) {
			bounds[0] = getNthLine("" + pageStartLine, LINE_COUNT_FIELD);
		}

		// The last page keeps the range's own end line and includes it.
		boolean isLastPage = (page >= (numPages - 1));

		if (!isLastPage) {
			int pageEndLine = Math.min(pageStartLine + pageSize, endLineNumber);
			bounds[1] = getNthLine("" + pageEndLine, LINE_COUNT_FIELD);
		}

		CloseableIterator pageLines = getClusterRange(bounds[0], bounds[1], isLastPage, false);
		return new PageResult(pageLines, numPages);
	}
	
	/**
	 * Looks up a summary line by the numeric value of one of its fields.
	 *
	 * <p>A binary search over the given field positions the reader. Lines are
	 * then read forward until one is found for which
	 * {@code SortedTextFile.numericComparator.compare(lineNumber, field) <= 0}.
	 *
	 * @param lineNumber the value to look for, as a decimal string
	 * @param lineField  zero-based index of the tab-separated field to compare
	 * @return the first matching line; if the input ends first, the last line
	 *         that was read, or null when no line was read at all
	 * @throws IOException if reading the summary fails
	 */
	public String getNthLine(String lineNumber, int lineField) throws IOException
	{
		SeekableLineReader reader = null;

		try {
			reader = summary.getSLR();
			FieldExtractingSLR fieldReader = new FieldExtractingSLR(reader, lineField, "\t");

			long offset = summary.binaryFindOffset(fieldReader, lineNumber, SortedTextFile.numericComparator);
			reader.seek(offset);

			// Skip one line after seeking to a non-zero offset.
			if (offset > 0) {
				reader.skipLine();
			}

			String result = null;
			String candidate;

			while ((candidate = reader.readLine()) != null) {
				result = candidate;

				String candidateNumber = candidate.split("\t")[lineField];

				if (SortedTextFile.numericComparator.compare(lineNumber, candidateNumber) <= 0) {
					break;
				}
			}

			return result;

		} finally {
			if (reader != null) {
				reader.close();
			}
		}
	}
	
	//TODO: Experimental?
	/**
	 * Estimates a size from the offsets (field 2) of consecutive summary lines.
	 *
	 * <p>For each adjacent pair where both lines have at least three
	 * tab-separated fields: if both name the same shard (field 1), the offset
	 * difference is added; otherwise only the later line's offset is added.
	 *
	 * @param blocks summary lines, in order
	 * @return the accumulated estimate
	 * @throws NumberFormatException if a used offset field is not a valid long
	 */
	public long getEstimateSplitSize(String[] blocks)
	{
		long estimate = 0;
		String[] previous = null;
		String[] current = null;

		for (int i = 0; i < blocks.length; i++) {
			previous = current;
			current = blocks[i].split("\t");

			boolean comparable = (previous != null) && (current.length >= 3) && (previous.length >= 3);

			if (!comparable) {
				continue;
			}

			long offset = Long.parseLong(current[2]);

			if (current[1].equals(previous[1])) {
				// If same shard, simply subtract
				estimate += (offset - Long.parseLong(previous[2]));
			} else {
				//TODO: Compute size of all in between shards
				estimate += offset;
			}
		}

		return estimate;
	}
	
	/**
	 * Iterates summary lines starting at {@code start} and bounded by {@code end}.
	 *
	 * @param start           key passed to {@code summary.getRecordIterator}
	 * @param end             end bound; an empty string means unbounded
	 * @param inclusive       passed to the end-bounding iterator
	 * @param includePrevLine passed to {@code summary.getRecordIterator}
	 * @return the bounded iterator
	 * @throws IOException if the summary cannot be read
	 */
	public CloseableIterator getClusterRange(String start, String end, boolean inclusive, boolean includePrevLine) throws IOException
	{
		CloseableIterator fromStart = summary.getRecordIterator(start, includePrevLine);
		return wrapEndIterator(fromStart, end, inclusive);
	}
	
	/**
	 * Applies {@link #wrapStartIterator} and then {@link #wrapEndIterator} to an iterator.
	 *
	 * @param iter      the iterator to bound
	 * @param start     start bound
	 * @param end       end bound
	 * @param inclusive passed to {@link #wrapEndIterator}
	 * @return the bounded iterator
	 */
	public static CloseableIterator wrapStartEndIterator(CloseableIterator iter, String start, String end, boolean inclusive)
	{
		CloseableIterator startBounded = wrapStartIterator(iter, start);
		return wrapEndIterator(startBounded, end, inclusive);
	}
	
	/**
	 * Bounds an iterator for reverse traversal: a {@code StartBoundedStringIterator}
	 * keyed on {@code end} is wrapped in a {@code BoundedStringIterator} keyed on
	 * {@code start}. The boolean flags are defined by those iterator classes.
	 *
	 * @param iter  the iterator to bound
	 * @param start bound given to the {@code BoundedStringIterator}
	 * @param end   bound given to the {@code StartBoundedStringIterator}
	 * @return the bounded iterator
	 */
	public static CloseableIterator wrapReverseIterator(CloseableIterator iter, String start, String end)
	{
		CloseableIterator endBounded = new StartBoundedStringIterator(iter, end, true);
		return new BoundedStringIterator(endBounded, start, false, true);
	}
	
	/**
	 * Wraps an iterator in a {@code StartBoundedStringIterator} keyed on {@code start}.
	 *
	 * @param iter  the iterator to bound
	 * @param start start bound
	 * @return the wrapping iterator
	 */
	public static CloseableIterator wrapStartIterator(CloseableIterator iter, String start)
	{
		CloseableIterator startBounded = new StartBoundedStringIterator(iter, start);
		return startBounded;
	}
	
	/**
	 * Wraps an iterator in a {@code BoundedStringIterator} keyed on {@code end},
	 * unless there is no end bound.
	 *
	 * <p>A null {@code end} is treated like an empty one. Callers in this class
	 * treat null as "no end bound", so this method must not fail on it.
	 *
	 * @param iter      the iterator to bound
	 * @param end       end bound; null or empty means unbounded
	 * @param inclusive passed to the {@code BoundedStringIterator}
	 * @return {@code iter} itself when unbounded, otherwise the wrapping iterator
	 */
	public static CloseableIterator wrapEndIterator(CloseableIterator iter, String end, boolean inclusive)
	{
		if ((end == null) || end.isEmpty()) {
			return iter;
		}

		return new BoundedStringIterator(iter, end, inclusive);
	}
	
	/**
	 * Convenience overload of the six-argument {@code getCDXIterator} that
	 * passes null for the {@code ZipNumParams} argument.
	 *
	 * @param summaryIterator iterator over summary lines
	 * @param start           start bound
	 * @param end             end bound
	 * @param split           index of this split
	 * @param numSplits       total number of splits
	 * @return the CDX line iterator
	 */
	public CloseableIterator getCDXIterator(CloseableIterator summaryIterator, String start, String end, int split, int numSplits)	
	{
		return getCDXIterator(summaryIterator, start, end, split, numSplits, null);
	}

	/**
	 * Builds a CDX line iterator for one split of a larger range.
	 *
	 * <p>The start bound is applied only to split 0, and the end bound only to
	 * the last split ({@code split >= numSplits - 1}). Null or empty bounds are
	 * ignored.
	 *
	 * @param summaryIterator iterator over summary lines
	 * @param start           start bound; null or empty for none
	 * @param end             end bound (exclusive); null or empty for none
	 * @param split           index of this split
	 * @param numSplits       total number of splits
	 * @param params          passed through to {@code getCDXIterator(CloseableIterator, ZipNumParams)}
	 * @return the CDX line iterator
	 */
	public CloseableIterator getCDXIterator(CloseableIterator summaryIterator, String start, String end, int split, int numSplits, ZipNumParams params)	
	{
		CloseableIterator cdxLines = this.getCDXIterator(summaryIterator, params);

		boolean isFirstSplit = (split == 0);
		boolean isLastSplit = (split >= (numSplits - 1));

		if (isFirstSplit && (start != null) && !start.isEmpty()) {
			cdxLines = wrapStartIterator(cdxLines, start);
		}

		if (isLastSplit && (end != null) && !end.isEmpty()) {
			cdxLines = wrapEndIterator(cdxLines, end, false);
		}

		return cdxLines;
	}
	
	/**
	 * Derives the end key for a key by appending {@code "!"} to it.
	 *
	 * @param key the key
	 * @return {@code key} followed by an exclamation mark
	 */
	public static String endKey(String key)
	{
		return key + '!';
	}
	
	/**
	 * Returns the CDX lines of the last block for {@code key}, start-bounded at {@code key}.
	 *
	 * @param key the key to look up
	 * @return the CDX line iterator
	 * @throws IOException if the summary cannot be read
	 */
	public CloseableIterator getLastBlockCDXLineIterator(String key) throws IOException {
		// the next line after last key is key! so this will return last key block
		String keyAfterLast = endKey(key);
		CloseableIterator lastBlockSummary = summary.getRecordIteratorLT(keyAfterLast);

		CloseableIterator cdxLines = getCDXIterator(lastBlockSummary);
		return wrapStartIterator(cdxLines, key);
	}
	
	/**
	 * End-bounds an iterator for a prefix query.
	 *
	 * <p>With {@code exact}, the bound is {@code endKey(prefix)} and is
	 * exclusive. Otherwise the bound is {@code prefix} itself and is inclusive.
	 *
	 * @param source the iterator to bound
	 * @param prefix the prefix
	 * @param exact  selects which of the two bounds is used
	 * @return the bounded iterator
	 */
	public static CloseableIterator wrapPrefix(CloseableIterator source, String prefix, boolean exact)
	{
		String bound = exact ? endKey(prefix) : prefix;
		boolean inclusive = !exact;

		return wrapEndIterator(source, bound, inclusive);
	}
	
	/**
	 * Builds a CDX line iterator for {@code key}, bounded by {@code start} and {@code end}.
	 *
	 * <p>The summary iterator is optionally wrapped for timestamp dedup,
	 * end-bounded, and buffered when the block loader buffers fully and
	 * {@code params.getMaxBlocks() > 0}. In reverse mode the CDX lines are
	 * bounded with {@link #wrapReverseIterator}; otherwise they get a start
	 * bound and, if present, an exclusive end bound.
	 *
	 * <p>{@code params} may be null, in which case no dedup, buffering or
	 * reverse handling is applied. A null or empty {@code end} means no end bound.
	 *
	 * @param key    key used to position the summary iterator
	 * @param start  start bound for the CDX lines
	 * @param end    end bound; null or empty for none
	 * @param params optional parameters; may be null
	 * @return the CDX line iterator
	 * @throws IOException if the summary cannot be read
	 */
	public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException {	
		boolean hasParams = (params != null);
		boolean hasEnd = (end != null) && !end.isEmpty();

		CloseableIterator summaryIter = summary.getRecordIteratorLT(key);

		if (hasParams && (params.getTimestampDedupLength() > 0)) {
			summaryIter = new TimestampDedupIterator(summaryIter, params.getTimestampDedupLength());
		}

		if (hasEnd) {
			summaryIter = wrapEndIterator(summaryIter, end, false);
		}

		if (blockLoader.isBufferFully() && hasParams && (params.getMaxBlocks() > 0)) {
			LineBufferingIterator lineBufferIter = new LineBufferingIterator(summaryIter, params.getMaxBlocks(), params.isReverse());
			lineBufferIter.bufferInput();
			summaryIter = lineBufferIter;
		}

		CloseableIterator cdxIter = getCDXIterator(summaryIter, params);

		if (hasParams && params.isReverse()) {
			return wrapReverseIterator(cdxIter, start, endKey(key));
		}

		cdxIter = wrapStartIterator(cdxIter, start);

		// Only apply the end bound when there is one, so a null end cannot fail.
		if (hasEnd) {
			cdxIter = wrapEndIterator(cdxIter, end, false);
		}

		return cdxIter;
	}
	
	
	//TODO: replace with matchType version
	/**
	 * Builds a CDX line iterator for {@code key}, with the summary bounded by
	 * {@link #wrapPrefix} on {@code start}.
	 *
	 * <p>The summary iterator is optionally wrapped for timestamp dedup,
	 * prefix-bounded, and buffered when the block loader buffers fully and
	 * {@code params.getMaxBlocks() > 0}. The resulting CDX lines are
	 * start-bounded at {@code start}.
	 *
	 * <p>{@code params} may be null, in which case no dedup or buffering is applied.
	 *
	 * @param key    key used to position the summary iterator
	 * @param start  prefix/start bound
	 * @param exact  passed to {@link #wrapPrefix}
	 * @param params optional parameters; may be null
	 * @return the CDX line iterator
	 * @throws IOException if the summary cannot be read
	 */
	public CloseableIterator getCDXIterator(String key, String start, boolean exact, ZipNumParams params) throws IOException {
		boolean hasParams = (params != null);

		CloseableIterator summaryIter = summary.getRecordIteratorLT(key);

		if (hasParams && (params.getTimestampDedupLength() > 0)) {
			summaryIter = new TimestampDedupIterator(summaryIter, params.getTimestampDedupLength());
		}

		summaryIter = wrapPrefix(summaryIter, start, exact);

		if (blockLoader.isBufferFully() && hasParams && (params.getMaxBlocks() > 0)) {
			LineBufferingIterator lineBufferIter = new LineBufferingIterator(summaryIter, params.getMaxBlocks(), params.isReverse());
			lineBufferIter.bufferInput();
			summaryIter = lineBufferIter;
		}

		return wrapStartIterator(getCDXIterator(summaryIter, params), start);
	}
	
	/**
	 * Builds a CDX line iterator positioned for {@code key} and start-bounded at it.
	 *
	 * @param key    key used both to position the summary and as the start bound
	 * @param params passed through to {@code getCDXIterator(CloseableIterator, ZipNumParams)}
	 * @return the CDX line iterator
	 * @throws IOException if the summary cannot be read
	 */
	public CloseableIterator getCDXIterator(String key, ZipNumParams params) throws IOException {
		CloseableIterator summaryLines = summary.getRecordIteratorLT(key);
		CloseableIterator cdxLines = getCDXIterator(summaryLines, params);

		return wrapStartIterator(cdxLines, key);
	}
	
	/**
	 * Turns an iterator over summary lines into an iterator over CDX lines by
	 * chaining a {@code SummaryBlockIterator} into a {@code MultiBlockIterator}.
	 *
	 * @param summaryIterator iterator over summary lines
	 * @param params          passed to the {@code SummaryBlockIterator}
	 * @return the CDX line iterator
	 */
	public CloseableIterator getCDXIterator(CloseableIterator summaryIterator, ZipNumParams params)
	{
		return new MultiBlockIterator(new SummaryBlockIterator(summaryIterator, this, params));
	}
	
	/**
	 * Convenience overload of {@code getCDXIterator(CloseableIterator, ZipNumParams)}
	 * that passes null for the params.
	 *
	 * @param summaryIterator iterator over summary lines
	 * @return the CDX line iterator
	 */
	public CloseableIterator getCDXIterator(CloseableIterator summaryIterator)
	{
		return getCDXIterator(summaryIterator, null);
	}
	
	/**
	 * Sets the summary file location. It is read by {@link #init()} and, when
	 * no path root is set, by {@code getReaderPath}.
	 *
	 * @param summaryFile location of the summary file
	 */
	public void setSummaryFile(String summaryFile) {
		this.summaryFile = summaryFile;
	}

	/** @return the configured summary file location; may be null */
	public String getSummaryFile() {
		return summaryFile;
	}
	
	/** @return the summary opened by {@link #init()}, or null if none has been opened */
	public SortedTextFile getSummary()
	{
		return summary;
	}

	/** @return the block loader currently in use; null until one is set or {@link #init()} runs */
	public ZipNumBlockLoader getBlockLoader() {
		return blockLoader;
	}

	/** @return the block size that {@link #init()} passes to the summary's binary search */
	public int getBinsearchBlockSize() {
        return binsearchBlockSize;
    }

    /**
     * Sets the binary-search block size. The value is applied to the summary
     * in {@link #init()}, so it must be set before init() is called.
     *
     * @param binsearchBlockSize the block size
     */
    public void setBinsearchBlockSize(int binsearchBlockSize) {
        this.binsearchBlockSize = binsearchBlockSize;
    }

    /** @return the read-ahead size that {@link #init()} passes to the stream factory */
    public int getReadaheadSize() {
		return readaheadSize;
	}

	/**
	 * Sets the read-ahead size used when {@link #init()} opens the summary file.
	 *
	 * @param readaheadSize the read-ahead size
	 */
	public void setReadaheadSize(int readaheadSize) {
		this.readaheadSize = readaheadSize;
	}

	/**
	 * Sets the block loader. When set before {@link #init()}, init() keeps it
	 * instead of creating a default one.
	 *
	 * @param blockLoader the block loader to use
	 */
	public void setBlockLoader(ZipNumBlockLoader blockLoader) {
		this.blockLoader = blockLoader;
	}

	/** @return the NIO flag that {@link #init()} passes to the stream factory */
	public boolean isUseNio() {
		return useNio;
	}

	/**
	 * Sets the NIO flag used when {@link #init()} opens the summary file.
	 *
	 * @param useNio the flag value
	 */
	public void setUseNio(boolean useNio) {
		this.useNio = useNio;
	}

	/** @return the configured CDX-lines-per-block value (informational; not used by this class) */
	public int getCdxLinesPerBlock() {
		return cdxLinesPerBlock;
	}

	/**
	 * Sets the CDX-lines-per-block value (informational; not used by this class).
	 *
	 * @param cdxLinesPerBlock the value to store
	 */
	public void setCdxLinesPerBlock(int cdxLinesPerBlock) {
		this.cdxLinesPerBlock = cdxLinesPerBlock;
	}

	/**
	 * Resolves a part id to the path of its {@code .gz} file.
	 *
	 * <p>If no path root is set, it is derived once from the summary file
	 * location (everything up to and including the last {@code '/'}) and stored.
	 *
	 * @param partId part identifier, with or without a {@code .gz} suffix
	 * @return the path root followed by the part id, ending in {@code .gz}
	 */
	String getReaderPath(String partId) {
		if (pathRoot == null) {
			pathRoot = summaryFile.substring(0, summaryFile.lastIndexOf('/') + 1);
		}

		String fileName = partId.endsWith(".gz") ? partId : (partId + ".gz");

		return pathRoot + fileName;
	}
	
	/**
	 * Loads one block of a part file through the block loader.
	 *
	 * @param partId      part identifier, resolved with {@code getReaderPath}
	 * @param startOffset offset passed to the loader
	 * @param totalLength length passed to the loader
	 * @return whatever {@code blockLoader.attemptLoadBlock} returns
	 */
	SeekableLineReader doBlockLoad(String partId, long startOffset, int totalLength) {
		return blockLoader.attemptLoadBlock(getReaderPath(partId), startOffset, totalLength, true, this.isRequired());
	}

	/** @return the path root; null if none has been set or derived yet */
	public String getPathRoot() {
		return pathRoot;
	}

	/**
	 * Sets the prefix that is prepended to part ids when resolving part files.
	 *
	 * @param pathRoot the path prefix
	 */
	public void setPathRoot(String pathRoot) {
		this.pathRoot = pathRoot;
	}

	/** @return the flag that {@code doBlockLoad} forwards to the block loader */
	public boolean isRequired() {
		return required;
	}

	/**
	 * Sets the flag that {@code doBlockLoad} forwards to the block loader.
	 *
	 * @param required the flag value
	 */
	public void setRequired(boolean required) {
		this.required = required;
	}

	/**
	 * Returns the stored total CDX line count.
	 *
	 * @return the value of {@code cdxLinesTotalCount}, which is initialized to 0
	 */
	@Override
    public long getTotalLines() {
		return cdxLinesTotalCount;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy