Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.archive.format.gzip.zipnum.ZipNumIndex Maven / Gradle / Ivy
package org.archive.format.gzip.zipnum;
import java.io.IOException;
import java.util.logging.Logger;
import org.archive.format.cdx.CDXInputSource;
import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.binsearch.FieldExtractingSLR;
import org.archive.util.binsearch.SeekableLineReader;
import org.archive.util.binsearch.SortedTextFile;
import org.archive.util.iterator.BoundedStringIterator;
import org.archive.util.iterator.CloseableIterator;
import org.archive.util.iterator.StartBoundedStringIterator;
public class ZipNumIndex implements CDXInputSource {
final static Logger LOGGER = Logger.getLogger(ZipNumIndex.class.getName());
protected String pathRoot;
protected String summaryFile;
protected int binsearchBlockSize = 8192;
protected int readaheadSize = 512;
protected SortedTextFile summary;
protected boolean required = true;
protected ZipNumBlockLoader blockLoader;
// Used only for reference / user info
protected int cdxLinesPerBlock = 3000;
protected long cdxLinesTotalCount = 0;
//protected HashMap locMap = null;
protected final static boolean DEFAULT_USE_NIO = true;
private static final int LINE_COUNT_FIELD = 4;
protected boolean useNio = DEFAULT_USE_NIO;
public ZipNumIndex()
{
}
public void init() throws IOException {
if (summaryFile != null) {
this.summary = new SortedTextFile(GeneralURIStreamFactory.createSeekableStreamFactory(summaryFile, readaheadSize, useNio));
this.summary.setBinsearchBlockSize(binsearchBlockSize);
}
if (blockLoader == null) {
this.blockLoader = new ZipNumBlockLoader();
}
}
public static ZipNumIndex createIndexWithSummaryPath(String summaryFile) throws IOException
{
ZipNumIndex zipIndex = new ZipNumIndex();
zipIndex.setSummaryFile(summaryFile);
zipIndex.init();
return zipIndex;
}
public static ZipNumIndex createIndexWithBasePath(String pathRoot) throws IOException
{
ZipNumIndex zipIndex = new ZipNumIndex();
zipIndex.setPathRoot(pathRoot);
zipIndex.init();
return zipIndex;
}
public static int extractLineCount(String line)
{
return (int)extractLongField(line, LINE_COUNT_FIELD);
}
protected static long extractLongField (String line, int index)
{
String[] parts = line.split("\t");
if (parts.length <= index) {
return -1;
}
long count = -1;
try {
count = Long.parseLong(parts[index]);
} catch (NumberFormatException n) {
}
return count;
}
public int getNumLines(String[] blocks)
{
if (blocks.length < 2) {
return 0;
}
int lastLine = -1;
int line = -1;
int size = 0;
for (String block : blocks) {
lastLine = line;
line = extractLineCount(block);
if (lastLine >= 0) {
size += (line - lastLine);
}
}
return size;
}
public long getTotalLength(String[] blocks)
{
long size = 0;
for (String block : blocks) {
size += extractLongField(block, 3);
}
return size;
}
public int getNumLines(String start, String end) throws IOException
{
SeekableLineReader slr = null;
String startLine = null;
String endLine = null;
int startCount = 0;
int endCount = 0;
try {
slr = summary.getSLR();
long[] offsets = summary.getStartEndOffsets(slr, start, end);
if (offsets[0] > 0) {
slr.seek(offsets[0]);
slr.readLine();
startLine = slr.readLine();
}
if (offsets[1] < slr.getSize()) {
slr.seek(offsets[1]);
slr.readLine();
endLine = slr.readLine();
}
// Get the last line
if (endLine == null) {
endLine = summary.getLastLine(slr);
}
if (endLine != null) {
endCount = extractLineCount(endLine);
}
if (startLine != null) {
startCount = extractLineCount(startLine);
}
} finally {
if (slr != null) {
slr.close();
}
}
return endCount - startCount;
}
public static class PageResult
{
final public CloseableIterator iter;
final public int numPages;
PageResult(CloseableIterator iter, int numPages)
{
this.iter = iter;
this.numPages = numPages;
}
}
public PageResult getNthPage(String[] startEnd, int page, int pageSize, boolean numPagesOnly) throws IOException
{
String startEndIdx[] = getSummary().getRange(startEnd[0], startEnd[1]);
int firstLineNumber = extractLineCount(startEndIdx[0]);
int endLineNumber = extractLineCount(startEndIdx[1]) + 1;
int totalLines = endLineNumber - firstLineNumber;
int numPages = ((totalLines - 1) / pageSize) + 1;
if (numPages < 1) {
numPages = 1;
}
if (numPagesOnly) {
return new PageResult(null, numPages);
}
if (page >= numPages) {
return new PageResult(null, numPages);
}
int firstPageLineNumber = (page * pageSize) + firstLineNumber;
int lastPageLineNumber = Math.min(firstPageLineNumber + pageSize, endLineNumber);
if (page > 0) {
startEndIdx[0] = getNthLine("" + firstPageLineNumber, LINE_COUNT_FIELD);
}
boolean endInclusive = false;
if (page < (numPages - 1)) {
startEndIdx[1] = getNthLine("" + lastPageLineNumber, LINE_COUNT_FIELD);
} else {
endInclusive = true;
}
CloseableIterator blocklines = getClusterRange(startEndIdx[0], startEndIdx[1], endInclusive, false);
return new PageResult(blocklines, numPages);
}
public String getNthLine(String lineNumber, int lineField) throws IOException
{
SeekableLineReader slr = null;
try {
slr = summary.getSLR();
FieldExtractingSLR lineCountReader = new FieldExtractingSLR(slr, lineField, "\t");
long offset = summary.binaryFindOffset(lineCountReader, lineNumber, SortedTextFile.numericComparator);
slr.seek(offset);
if (offset > 0) {
slr.skipLine();
}
String fullLine = null;
String prevLine = null;
while (true) {
prevLine = fullLine;
fullLine = slr.readLine();
if (fullLine == null) {
fullLine = prevLine;
break;
}
String currLineNumber = fullLine.split("\t")[lineField];
if (SortedTextFile.numericComparator.compare(lineNumber, currLineNumber) <= 0) {
break;
}
}
return fullLine;
} finally {
if (slr != null) {
slr.close();
}
}
}
//TODO: Experimental?
public long getEstimateSplitSize(String[] blocks)
{
String parts[] = null, lastParts[] = null;
long totalSize = 0;
for (String block : blocks) {
lastParts = parts;
parts = block.split("\t");
if ((lastParts != null) && (parts.length >= 3) && (lastParts.length >= 3)) {
// If same shard, simply subtract
long newOffset = Long.parseLong(parts[2]);
if (parts[1].equals(lastParts[1])) {
long lastOffset = Long.parseLong(lastParts[2]);
totalSize += (newOffset - lastOffset);
} else {
totalSize += newOffset;
//TODO: Compute size of all in between shards
//computeBlockSizeDiff();
}
}
}
return totalSize;
}
public CloseableIterator getClusterRange(String start, String end, boolean inclusive, boolean includePrevLine) throws IOException
{
CloseableIterator iter = null;
iter = summary.getRecordIterator(start, includePrevLine);
return wrapEndIterator(iter, end, inclusive);
//return wrapStartEndIterator(iter, start, end, inclusive);
}
public static CloseableIterator wrapStartEndIterator(CloseableIterator iter, String start, String end, boolean inclusive)
{
return wrapEndIterator(wrapStartIterator(iter, start), end, inclusive);
}
public static CloseableIterator wrapReverseIterator(CloseableIterator iter, String start, String end)
{
iter = new StartBoundedStringIterator(iter, end, true);
iter = new BoundedStringIterator(iter, start, false, true);
return iter;
}
public static CloseableIterator wrapStartIterator(CloseableIterator iter, String start)
{
return new StartBoundedStringIterator(iter, start);
}
public static CloseableIterator wrapEndIterator(CloseableIterator iter, String end, boolean inclusive)
{
if (end.isEmpty()) {
return iter;
} else {
return new BoundedStringIterator(iter, end, inclusive);
}
}
public CloseableIterator getCDXIterator(CloseableIterator summaryIterator, String start, String end, int split, int numSplits)
{
return getCDXIterator(summaryIterator, start, end, split, numSplits, null);
}
public CloseableIterator getCDXIterator(CloseableIterator summaryIterator, String start, String end, int split, int numSplits, ZipNumParams params)
{
CloseableIterator blocklines = this.getCDXIterator(summaryIterator, params);
if ((split == 0) && (start != null) && !start.isEmpty()) {
blocklines = wrapStartIterator(blocklines, start);
}
if ((split >= (numSplits - 1)) && (end != null) && !end.isEmpty()) {
blocklines = wrapEndIterator(blocklines, end, false);
}
return blocklines;
}
public static String endKey(String key)
{
return key + "!";
}
public CloseableIterator getLastBlockCDXLineIterator(String key) throws IOException {
// the next line after last key is key! so this will return last key block
CloseableIterator summaryIter = summary.getRecordIteratorLT(endKey(key));
return wrapStartIterator(getCDXIterator(summaryIter), key);
}
public static CloseableIterator wrapPrefix(CloseableIterator source, String prefix, boolean exact)
{
if (exact) {
return wrapEndIterator(source, endKey(prefix), false);
} else {
return wrapEndIterator(source, prefix, true);
}
}
public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException {
CloseableIterator summaryIter = summary.getRecordIteratorLT(key);
if (params.getTimestampDedupLength() > 0) {
summaryIter = new TimestampDedupIterator(summaryIter, params.getTimestampDedupLength());
}
if (end != null && !end.isEmpty()) {
summaryIter = wrapEndIterator(summaryIter, end, false);
}
if (blockLoader.isBufferFully() && (params != null) && (params.getMaxBlocks() > 0)) {
LineBufferingIterator lineBufferIter = new LineBufferingIterator(summaryIter, params.getMaxBlocks(), params.isReverse());
lineBufferIter.bufferInput();
summaryIter = lineBufferIter;
}
if (params.isReverse()) {
return wrapReverseIterator(getCDXIterator(summaryIter, params), start, endKey(key));
} else {
return wrapStartEndIterator(getCDXIterator(summaryIter, params), start, end, false);
}
}
//TODO: replace with matchType version
public CloseableIterator getCDXIterator(String key, String start, boolean exact, ZipNumParams params) throws IOException {
CloseableIterator summaryIter = summary.getRecordIteratorLT(key);
if (params.getTimestampDedupLength() > 0) {
summaryIter = new TimestampDedupIterator(summaryIter, params.getTimestampDedupLength());
}
summaryIter = wrapPrefix(summaryIter, start, exact);
if (blockLoader.isBufferFully() && (params != null) && (params.getMaxBlocks() > 0)) {
LineBufferingIterator lineBufferIter = new LineBufferingIterator(summaryIter, params.getMaxBlocks(), params.isReverse());
lineBufferIter.bufferInput();
summaryIter = lineBufferIter;
}
return wrapStartIterator(getCDXIterator(summaryIter, params), start);
}
public CloseableIterator getCDXIterator(String key, ZipNumParams params) throws IOException {
CloseableIterator summaryIter = summary.getRecordIteratorLT(key);
return wrapStartIterator(getCDXIterator(summaryIter, params), key);
}
public CloseableIterator getCDXIterator(CloseableIterator summaryIterator, ZipNumParams params)
{
SummaryBlockIterator blockIter = new SummaryBlockIterator(summaryIterator, this, params);
MultiBlockIterator zipIter = new MultiBlockIterator(blockIter);
return zipIter;
}
public CloseableIterator getCDXIterator(CloseableIterator summaryIterator)
{
return getCDXIterator(summaryIterator, null);
}
public void setSummaryFile(String summaryFile) {
this.summaryFile = summaryFile;
}
public String getSummaryFile() {
return summaryFile;
}
public SortedTextFile getSummary()
{
return summary;
}
public ZipNumBlockLoader getBlockLoader() {
return blockLoader;
}
public int getBinsearchBlockSize() {
return binsearchBlockSize;
}
public void setBinsearchBlockSize(int binsearchBlockSize) {
this.binsearchBlockSize = binsearchBlockSize;
}
public int getReadaheadSize() {
return readaheadSize;
}
public void setReadaheadSize(int readaheadSize) {
this.readaheadSize = readaheadSize;
}
public void setBlockLoader(ZipNumBlockLoader blockLoader) {
this.blockLoader = blockLoader;
}
public boolean isUseNio() {
return useNio;
}
public void setUseNio(boolean useNio) {
this.useNio = useNio;
}
public int getCdxLinesPerBlock() {
return cdxLinesPerBlock;
}
public void setCdxLinesPerBlock(int cdxLinesPerBlock) {
this.cdxLinesPerBlock = cdxLinesPerBlock;
}
String getReaderPath(String partId) {
if (pathRoot == null) {
int lastSlash = summaryFile.lastIndexOf('/');
pathRoot = this.summaryFile.substring(0, lastSlash + 1);
}
if (!partId.endsWith(".gz")) {
partId += ".gz";
}
String gzFile = pathRoot + partId;
return gzFile;
}
SeekableLineReader doBlockLoad(String partId, long startOffset, int totalLength) {
String path = getReaderPath(partId);
return blockLoader.attemptLoadBlock(path, startOffset, totalLength, true, this.isRequired());
}
public String getPathRoot() {
return pathRoot;
}
public void setPathRoot(String pathRoot) {
this.pathRoot = pathRoot;
}
public boolean isRequired() {
return required;
}
public void setRequired(boolean required) {
this.required = required;
}
@Override
public long getTotalLines() {
return cdxLinesTotalCount;
}
}