
org.terrier.structures.indexing.FSOMapFileLexiconUtilities Maven / Gradle / Ivy
The newest version!
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is FSOMapFileLexiconUtilities.java.
*
* The Original Code is Copyright (C) 2017-2020 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald
*/
package org.terrier.structures.indexing;
import gnu.trove.TIntObjectHashMap;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.structures.FSOMapFileLexicon;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.collections.FSOrderedMapFile;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.Files;
import org.terrier.utility.io.WrappedIOException;
public class FSOMapFileLexiconUtilities {
public static final Logger logger = LoggerFactory.getLogger(FSOMapFileLexiconUtilities.class);
/**
* optimise
* @param structureName
* @param index
* @param statsCounter
* @param numEntries
* @throws IOException
*/
@SuppressWarnings({"unchecked", "resource"})
public static void optimise(
String structureName,
IndexOnDisk index,
LexiconBuilder.CollectionStatisticsCounter statsCounter,
int numEntries)
throws IOException
{
final String mapFileFilename = FSOMapFileLexicon.constructFilename(structureName, index.getPath(), index.getPrefix(), FSOMapFileLexicon.MAPFILE_EXT);
final FixedSizeWriteableFactory keyFactory =
(FixedSizeWriteableFactory)index.getIndexStructure(structureName+"-keyfactory");
final FixedSizeWriteableFactory valueFactory =
(FixedSizeWriteableFactory)index.getIndexStructure(structureName+"-valuefactory");
logger.info("Optimising lexicon with "+ numEntries + " entries");
//term id lookups
boolean termIdsAligned = true;
int[] termid2index = new int[numEntries];
Arrays.fill(termid2index, -1);
int counter= 0; int lastTermId = -1;
//bsearch reduction
int previousFirstChar = -1;
int firstChar = 0;
final TIntObjectHashMap map = new TIntObjectHashMap();
Iterator> iterator =
new FSOrderedMapFile.EntryIterator(mapFileFilename, keyFactory, valueFactory);
Map.Entry lee = null;
int termId = Integer.MIN_VALUE;
try {
while(iterator.hasNext())
{
lee = iterator.next();
//System.err.println(lee.toString());
//System.err.println(lee.toString() +" "+lee.getValue().getTermId()+" "+lee.getValue().getFrequency());
//term id
termId = lee.getValue().getTermId();
if (! (termId == lastTermId+1))
termIdsAligned = false;
if (termid2index[termId] != -1)
{
throw new WrappedIOException(new IllegalArgumentException("Termid " + termId + " is not unique - used at entries " +termid2index[termId]+ " and" + counter));
}
termid2index[termId] = counter;
lastTermId = termId;
//bsearch reduction optimisaion
firstChar = lee.getKey().charAt(0);
if (firstChar!=previousFirstChar) {
int[] boundaries = new int[] {counter, 0};
map.put(firstChar, boundaries);
previousFirstChar = firstChar;
}
//increments
statsCounter.count(lee.getValue());
counter++;
}
} catch (ArrayIndexOutOfBoundsException ae) {
logger.error("Termid " + termId + " is too large (expected only "
+termid2index.length +" entries). Bad lexicon entry is: "
+ lee.getKey().toString() + " -> " + lee.getValue().toString() );
throw ae;
}
if (counter != numEntries)
termIdsAligned = false;
IndexUtil.close(iterator);
//deal with termids
if (termIdsAligned)
{
index.setIndexProperty("index."+structureName+".termids", "aligned");
logger.info("All ids for structure "+structureName+ " are aligned, skipping "
+FSOMapFileLexicon.ID_EXT+ " file");
}
else
{
DataOutputStream dos = new DataOutputStream(Files.writeFileStream(
FSOMapFileLexicon.constructFilename(structureName, ((IndexOnDisk) index).getPath(), ((IndexOnDisk) index).getPrefix(), FSOMapFileLexicon.ID_EXT)));
for(int indexof : termid2index)
dos.writeInt(indexof);
dos.close();
index.setIndexProperty("index."+structureName+".termids", (numEntries > 15000000) ? "file" : "fileinmem");
}
int[] mapKeys = map.keys();
Arrays.sort(mapKeys);
final int mapKeysSize = mapKeys.length;
for (int i=0; i keyFactory =
(FixedSizeWriteableFactory)index.getIndexStructure(structureName+"-keyfactory");
final FixedSizeWriteableFactory valueFactory =
(FixedSizeWriteableFactory)index.getIndexStructure(structureName+"-valuefactory");
final int numEntries = FSOrderedMapFile.numberOfEntries(mapFileFilename, keyFactory, valueFactory);
optimise(structureName, index, statsCounter, numEntries);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy