All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.terrier.structures.indexing.BaseMetaIndexBuilder Maven / Gradle / Ivy

The newest version!
/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org/
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is CompressingMetaIndexBuilder.java
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald  (original contributor)
 */
package org.terrier.structures.indexing;

import gnu.trove.TObjectIntHashMap;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.Flushable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.MetaIndex;
import org.terrier.structures.collections.FSOrderedMapFile;
import org.terrier.structures.collections.FSOrderedMapFile.MapFileWriter;
import org.terrier.structures.collections.FSOrderedMapFile.MultiFSOMapWriter;
import org.terrier.structures.seralization.FixedSizeIntWritableFactory;
import org.terrier.structures.seralization.FixedSizeTextFactory;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
import org.terrier.utility.MemoryChecker;
import org.terrier.utility.RuntimeMemoryChecker;
/**
 * Abstract base class for compressed and uncompressed metaindex building.
 * Properties:
 * 
 * <ul>
 * <li><code>metaindex.compressed.max.data.in-mem.mb</code> - maximum size up to which a meta index data (.zdata) file will be kept in memory. Defaults to 400 (MB).</li>
 * <li><code>metaindex.compressed.max.index.in-mem.mb</code> - maximum size up to which the meta index's offsets index file (as opposed to the .zdata data file) will be kept in memory. Defaults to 100 (MB).</li>
 * <li><code>metaindex.compressed.reverse.allow.duplicates</code> - set this property to true to suppress errors when a reverse meta value is not unique. Defaults to false.</li>
 * <li><code>metaindex.compressed.crop.long</code> - set this property to true to suppress errors for overlong Document metadata, which will instead be cropped. Defaults to false.</li>
 * </ul>
* @since 3.0 * @author Craig Macdonald & Vassilis Plachouras */ public abstract class BaseMetaIndexBuilder extends MetaIndexBuilder implements Flushable { protected final Logger logger = LoggerFactory.getLogger(BaseMetaIndexBuilder.class); protected final int MAX_MB_IN_MEM_RETRIEVAL = Integer.parseInt(ApplicationSetup.getProperty("metaindex.compressed.max.data.in-mem.mb", "400")); protected final int MAX_INDEX_MB_IN_MEM_RETRIEVAL = Integer.parseInt(ApplicationSetup.getProperty("metaindex.compressed.max.index.in-mem.mb", "100")); protected final boolean REVERSE_ALLOW_DUPS = Boolean.parseBoolean(ApplicationSetup.getProperty("metaindex.compressed.reverse.allow.duplicates", "false")); protected final boolean CROP_LONG = Boolean.parseBoolean(ApplicationSetup.getProperty("metaindex.compressed.crop.long", "false")); protected final int REVERSE_KEY_LOOKUP_WRITING_BUFFER_SIZE = 20000; protected final int DOCS_PER_CHECK = ApplicationSetup.DOCS_CHECK_SINGLEPASS; protected final TObjectIntHashMap key2Index; protected DataOutputStream dataOutput = null; protected final String[] keyNames; protected final int keyCount; protected ByteArrayOutputStream baos = new ByteArrayOutputStream(); protected DataOutputStream indexOutput = null; protected byte[] compressedBuffer = new byte[1024]; protected IndexOnDisk index; protected int[] valueLensChars; protected int[] valueLensBytes; protected byte[] spaces; protected int entryLengthBytes = 0; protected long currentOffset = 0; protected long currentIndexOffset = 0; protected int entryCount = 0; protected int[] reverseKeys; protected String[] reverseKeyNames; protected MapFileWriter[] reverseWriters; protected boolean[] valuesSorted; protected String[] lastValues; protected MemoryChecker memCheck = new RuntimeMemoryChecker(); protected FixedSizeWriteableFactory[] keyFactories; protected String structureName; protected Class structureClass; protected Class structureInputStreamClass; /** * constructor * @param _index * @param _keyNames * 
@param _valueLens * @param _reverseKeys */ public BaseMetaIndexBuilder(IndexOnDisk _index, String[] _keyNames, int[] _valueLens, String[] _reverseKeys) { this(_index, "meta", _keyNames, _valueLens, _reverseKeys); } /** * constructor * @param _index * @param _structureName * @param _keyNames * @param _valueLens * @param _reverseKeys */ @SuppressWarnings("unchecked") public BaseMetaIndexBuilder(IndexOnDisk _index, String _structureName, String[] _keyNames, int[] _valueLens, String[] _reverseKeys) { this.index = _index; this.structureName = _structureName; this.keyNames = _keyNames; this.valueLensChars = _valueLens; if (this.keyNames.length != this.valueLensChars.length) throw new IllegalArgumentException(this.getClass().getSimpleName() + " configuration incorrect: number of keys and number of value lengths are unequal: "+ Arrays.toString(keyNames) + " vs " + Arrays.toString(_valueLens)); this.key2Index = new TObjectIntHashMap(keyNames.length); this.keyCount = keyNames.length; for(int i=0;i 0); for(i=0;i data) throws IOException { String[] values = new String[keyCount]; int i=0; for(String keyName : keyNames) { values[i++] = data.get(keyName); } writeDocumentEntry(values); } /** {@inheritDoc} */ @Override public void writeDocumentEntry(String[] data) throws IOException { int i=0; for(String value : data) { if (value == null) value = ""; else if (value.length() > valueLensChars[i]) if (CROP_LONG) { value = value.substring(0,valueLensChars[i]-1); }else throw new IllegalArgumentException("CROP_LONG="+CROP_LONG+": Data ("+value+") of string length "+value.length()+" for key " +keyNames[i]+" exceeds max string length of " + valueLensChars[i] +"(byte length of " + valueLensBytes[i] + "). 
Crop in the Document, increase indexer.meta.forward.keylens, or set metaindex.compressed.crop.long"); byte[] b = Text.encode(value).array(); int numberOfBytesToWrite = b.length; while (numberOfBytesToWrite > valueLensBytes[i]) { if (CROP_LONG) { // we have reached an exception case, see http://terrier.org/issues/browse/TR-518 // incrementally shorten the value until it can be encoded // guess overfill double oversizeRatio = (1.0*valueLensBytes[i])/numberOfBytesToWrite; int newTargetLength = (int)(value.length()*oversizeRatio); value = value.substring(0,newTargetLength-1); b = Text.encode(value).array(); numberOfBytesToWrite = b.length; //logger.info("Extra cropping was applied, reducing text to length "+value.length()+" characters to fit in the target byte length "+numberOfBytesToWrite+"/"+valueLensBytes[i]); } else { throw new IllegalArgumentException("CROP_LONG="+CROP_LONG+": Data ('"+value+"') with "+value.length()+" characters and byte length "+numberOfBytesToWrite+" for key " +keyNames[i]+" exceeds max byte length of " + valueLensBytes[i] +"(string length of " + valueLensChars[i] + "). Crop in the Document, increase indexer.meta.forward.keylens, or set metaindex.compressed.crop.long"); } } baos.write(b); if (numberOfBytesToWrite < valueLensBytes[i]) baos.write(spaces, 0, valueLensBytes[i]-numberOfBytesToWrite); if (valuesSorted[i] && entryCount > 0 && lastValues[i].compareTo(value) >= 0) { if (logger.isDebugEnabled()) logger.debug( "docid " + entryCount + " key " + keyNames[i] + " value " + value + " it not lexicographically after " + lastValues[i] + " - key is not sorted"); valuesSorted[i] = false; } lastValues[i] = value; i++; } indexOutput.writeLong(currentOffset); currentOffset += writeData(baos.toByteArray()); currentIndexOffset += 8; baos.reset(); for(i=0;i MAX_MB_IN_MEM_RETRIEVAL * (long)1024 * (long)1024 ? 
"file" : "fileinmem"); index.setIndexProperty("index."+structureName+".index-source", currentIndexOffset > MAX_INDEX_MB_IN_MEM_RETRIEVAL* (long)1024 * (long)1024 ? "file" : "fileinmem"); index.flush(); for(var forwardWriter : reverseWriters) { forwardWriter.close(); } index.setIndexProperty("index."+structureName+".reverse-key-names", ArrayUtils.join(reverseKeyNames, ",")); index.flush(); logger.debug("Finished writing metaindex:" + " keys " + Arrays.toString(keyNames) + " keylens " + Arrays.toString(valueLensChars) + " sorted " + Arrays.toString(valuesSorted) + " data file size " + currentOffset); if (currentOffset > 0) { float uncompressedSize = (long) entryLengthBytes * (long) entryCount; float compressionRatio = uncompressedSize / (float) currentOffset; logger.info(this.getClass().getSimpleName() + " " + structureName + " achieved compression ratio " + compressionRatio + " (> 1 is better)"); if (compressionRatio < 1) { logger.info("Compression of metaindex actually increased file size; you might achieve reduced space consumption by using an uncompressed metaindex"); } } else { logger.info("Empty metaindex"); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy