org.terrier.structures.IndexUtil Maven / Gradle / Ivy
The newest version!
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org/
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
* The Original Code is IndexUtil.java
* The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
* All Rights Reserved.
* Contributor(s):
* Craig Macdonald (original contributor)
package org.terrier.structures;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import com.google.gson.Gson;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.hadoop.io.Writable;
import org.terrier.applications.CLITool;
import org.terrier.applications.CLITool.CLIParsedCLITool;
import org.terrier.querying.IndexRef;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
/**
 * Class with handy utilities for use on an Index.
 * @since 3.0
 */
public class IndexUtil {
// Command-line tool, invoked under the name "indexutil", that prints the contents of index structures.
// NOTE(review): this extract has lost closing braces, generic type arguments and several statement
// lines; the comments below describe only what is still visible.
public static class Command extends CLIParsedCLITool {
// NOTE(review): the body of sourcepackage() is not visible in this extract.
public String sourcepackage() {
// Name of this command.
public String commandname() {
return "indexutil";
// One-line summary of what the command does.
public String helpsummary() {
return "utilities for displaying the content of an index";
// Resolves the index to work on, then performs the action selected by the parsed options.
// Visible return values: 2 when the index cannot be loaded, -1 for an unknown term, an unmatched
// docid or a wrong argument count, and 0 as the last visible statement.
@SuppressWarnings({ "unchecked", "deprecation" })
public int run(CommandLine line) throws Exception {
// default index reference comes from ApplicationSetup; the -I option replaces it
IndexRef iRef = IndexRef.of(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
if (line.hasOption("I")) {
String indexLocation = line.getOptionValue("I");
iRef = IndexRef.of(indexLocation);
// load the index
final Index index = IndexFactory.of(iRef);
if (index == null) {
System.err.println("Index for ref "+iRef+" not found: " + IndexOnDisk.getLastIndexLoadError());
return 2;
// command loop
// each branch below picks a default structure name that the -s option can override
if (line.hasOption("printpostingfile")) {
String structureName = "inverted";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
// NOTE(review): the statement below is cut off after "index" and the rest of this branch is missing
PostingIndexInputStream bpiis = (PostingIndexInputStream) index
} else if (line.hasOption("printterm")) {
// put a stand-in document index into the structure cache: it reports the collection's document
// count, returns 0 for every document length and null for every entry
// (presumably so postings can be read without the real document index — confirm)
IndexUtil.forceStructure(index, "document", new DocumentIndex() {
public int getNumberOfDocuments() {
return index.getCollectionStatistics().getNumberOfDocuments();
public int getDocumentLength(int docid) throws IOException {
return 0;
public DocumentIndexEntry getDocumentEntry(int docid) throws IOException {
return null;
String structureName = "lexicon";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
String term = line.getOptionValue("printterm");
Lexicon lex = (Lexicon) index.getIndexStructure(structureName);
// NOTE(review): the generic type arguments on the next line are garbled in this extract
PostingIndex> inv = (PostingIndex>) index.getInvertedIndex();
LexiconEntry le = lex.getLexiconEntry(term);
if (le == null) {
System.err.println("Term " + term + " not found");
return -1;
// walk the posting list of the term until the end-of-list marker
IterablePosting ip = inv.getPostings(le);
while (ip.next() != IterablePosting.EOL) {
// NOTE(review): only a separator is printed here; a statement printing the posting itself seems to be missing
System.out.println(" ");
} else if (line.hasOption("printposting")) {
// same stand-in document index as in the printterm branch
IndexUtil.forceStructure(index, "document", new DocumentIndex() {
public int getNumberOfDocuments() {
return index.getCollectionStatistics().getNumberOfDocuments();
public int getDocumentLength(int docid) throws IOException {
return 0;
public DocumentIndexEntry getDocumentEntry(int docid) throws IOException {
return null;
// exactly two values are expected: localArgs[0] is looked up as a term, localArgs[1] is parsed as a docid
String[] localArgs = line.getOptionValues("printposting");
if (localArgs.length != 2) {
System.err.println("Usage: --printposting ");
return -1;
Lexicon lex = index.getLexicon();
PostingIndex inv = (PostingIndex) index.getInvertedIndex();
// NOTE(review): unlike the printterm branch, no null check on the lexicon entry is visible here
LexiconEntry le = lex.getLexiconEntry(localArgs[0]);
IterablePosting ip = inv.getPostings(le);
int targetId = Integer.parseInt(localArgs[1]);
// next(targetId) returns the id the iterator stopped on, which is compared with the requested docid
int foundId = ip.next(targetId);
if (foundId == targetId) {
// NOTE(review): the success branch body and the call that receives the message below are missing
} else {
"Docid " + targetId + " not found for term " + localArgs[0] + " (nearest was " + foundId + ")");
return -1;
// disabled branch, kept as found
// } else if (line.hasOption("printbitentry")) {
// String structureName = "inverted";
// if (line.hasOption("s"))
// structureName = line.getOptionValue("s");
// List pointerList = (List) index.getIndexStructure();
// PostingIndex> bpi = (PostingIndex>) index.getIndexStructure(structureName);
// // for every docid on cmdline
// for (String arg: args) {
// BitIndexPointer pointer = pointerList.get(Integer.parseInt(arg));
// if (pointer.getNumberOfEntries() == 0)
// continue;
// System.out.print(arg + " ");
// IterablePosting ip = bpi.getPostings(pointer);
// while (ip.next() != IterablePosting.EOL) {
// System.out.print(ip.toString());
// System.out.print(" ");
// }
// System.out.println();
// }
} else if (line.hasOption("printlex")) {
// delegates to LexiconUtil.printLexicon
String structureName = "lexicon";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
LexiconUtil.printLexicon(index, structureName);
} else if (line.hasOption("printdocument")) {
// delegates to printDocumentIndex
String structureName = "document";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
printDocumentIndex(index, structureName);
} else if (line.hasOption("printlist")) {
String structureName = "document";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
// NOTE(review): the next statement is cut off after "index" and the loop body is missing
Iterator extends Writable> in = (Iterator extends Writable>) index
while (in.hasNext()) {
} else if (line.hasOption("printlistentry")) {
String structureName = "document";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
List extends Writable> list = (List extends Writable>) index.getIndexStructure(structureName);
// one pass per value given to --printlistentry
// NOTE(review): the loop body is missing
for(String arg : line.getOptionValues("printlistentry"))
} else if (line.hasOption("printmeta")) {
// -j selects the JSON printer
boolean json = line.hasOption("j");
String structureName = "meta";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
// NOTE(review): no else is visible between the two calls below — confirm the plain printer is meant to be the alternative
if (json)
printMetaIndexJson(index, structureName);
printMetaIndex(index, structureName);
// NOTE(review): the body of the final else branch is missing
} else {
return 0;
// Starts from the options of the superclass.
// NOTE(review): only the description strings of the added options survive in this extract; the
// calls that name each option and add it to opts are missing.
protected Options getOptions() {
Options opts = super.getOptions();
.desc("display contents of a meta index")
.desc("display contents of a document index")
.desc("display contents of a lexicon index")
.desc("display contents of a list index structure, such as a document index")
.desc("display one entry in a list index structure, such as an entry in a document index")
.desc("display posting for specific term and specified docid")
.desc("Display contents of a posting file (e.g. inverted or direct)")
.desc("Display contents of a posting list for one term")
.desc("Change the name structure being operated on")
.desc("Display output in JSON (only for printmeta)")
return opts;
/** Has some handy utilities for printing various index structures to System.out, such as:
* - --printbitfile - print the bit file with the specified name
* - --printbitentry - print one entry in a bit posting index.
* - --printlex - print the entire lexicon
* - --printdocument - print the document index
* - --printlist - print the named list (e.g. document index)
* - --printmeta - print the meta index
* See bin/terrier help indexutils for more
public static void main(String[] args) {
CLITool.run(Command.class, args);
/** Force the specified object into the structure cache of the specified object,
* as the given structure name
* @param index Index to operate on
* @param structureName which structure name to use
* @param structure which object to put into the structure cache
public static void forceStructure(Index index, String structureName, Object structure)
((IndexOnDisk) index).structureCache.put(structureName, structure);
/** Forces a structure to be reloaded, by removing it from the index's structure cache */
public static void forceReloadStructure(Index index, String structureName)
((IndexOnDisk) index).structureCache.remove(structureName);
public static IndexOnDisk reOpenIndex(Index index) throws IOException
return reOpenIndex((IndexOnDisk) index);
/** Reopen an existing index */
public static IndexOnDisk reOpenIndex(IndexOnDisk index) throws IOException
IndexOnDisk rtr = null;
String path = index.getPath();
String prefix = index.getPrefix();
rtr = IndexOnDisk.createIndex(path, prefix);
return rtr;
/** Returns a list of the structures in the given index */
public static String[] getStructures(PropertiesIndex index)
List rtr = new ArrayList();
for(Object o : index.getProperties().keySet())
String key = (String)o;
if (key.matches("index\\..+\\.class"))
key = key.replaceAll("index.", "");
key = key.replaceFirst(".class", "");
return rtr.toArray(new String[0]);
/** Move an index from one location to another */
public static void renameIndex(String srcPath, String srcPrefix, String dstPath, String dstPrefix)
throws IOException
final String actualPrefix = srcPrefix +'.';
for (String filename : Files.list(srcPath))
if (filename.startsWith(actualPrefix))
final String newFilename = filename.replaceFirst(srcPrefix, dstPrefix);
if (! Files.rename(srcPath + "/" + filename, dstPath+"/"+ newFilename))
final String srcExists = Files.exists(srcPath + "/" + filename) ? "exists" : "notexists";
final String destExists = Files.exists(dstPath+"/"+ newFilename) ? "exists" : "notexists";
throw new IOException("Rename of index structure file '"+srcPath + "/" + filename+"' ("+srcExists+") to " +
"'"+ dstPath+"/"+ newFilename +"' ("+destExists+") failed - likely that source file is still open. " +
"Possible indexing bug?");
/** Delete an existing index */
public static void deleteIndex(String path, String prefix)
throws IOException
final String actualPrefix = prefix +'.';
String[] files = Files.list(path);
if (files == null)
for (String filename : files)
if (filename.startsWith(actualPrefix))
Files.delete(path + "/" + filename);
/** Print the contents of the document index */
public static void printDocumentIndex(Index index, String structureName) throws IOException
Iterator iterator = (Iterator)index.getIndexStructureInputStream(structureName);
int docid =0;
DocumentIndexEntry die = iterator.next();
System.out.println(docid +": " + die.toString());
/** Delete the named structure from the specified index.
* Deletes files as well.
* @param index - index to operate on
* @param structureName name of structure to delete
* @return true if structure was found and deleted, false otherwise
public static boolean deleteStructure(IndexOnDisk index, String structureName) throws IOException
boolean found = false;
List toRemove = new ArrayList();
for(Object o : index.getProperties().keySet())
String key = (String)o;
if (key.startsWith("index."+structureName + "."))
found = true;
for(String key : toRemove)
for(String file : Files.list(((IndexOnDisk) index).getPath()))
if (file.startsWith(((IndexOnDisk) index).getPrefix() + "." + structureName + "."))
Files.delete(((IndexOnDisk) index).getPath() + "/" + file);
return found;
/** Checks the underlying structurecache of the specificed index to see if the
* named index structure is there.
* @param index index to examine
* @param structureName what structure
* @return true if the structure cache contains the item
public static boolean isStructureOpen(IndexOnDisk index, String structureName) {
return index.structureCache.containsKey(structureName);
/** Copies an index structure from one index to another.
* @param sourceIndex
* @param destIndex
* @param sourceStructureName
* @param destinationStructureName
* @throws IOException if an IO problem occurs
public static boolean copyStructure(IndexOnDisk sourceIndex, IndexOnDisk destIndex, String sourceStructureName, String destinationStructureName) throws IOException
boolean found = false;
/* if source and destination index as the same, then a ConcurrentModificationException
* will occur if we try to alter the Properties table while the iteration is taking place.
* Hence, to prevent this, we create a temporary Properties, put new properties to that,
* then apply all new properties back on the source(dest) index */
final boolean sameIndex = sourceIndex == destIndex;
// use temporary index as destination if sameIndex
Properties destProperties = sameIndex ? new Properties() : destIndex.properties;
for(Object o : sourceIndex.getProperties().keySet())
String key = (String)o;
if (key.startsWith("index."+sourceStructureName + "."))
key.replaceFirst("^index\\."+sourceStructureName + "\\.",
"index." + destinationStructureName + "."), sourceIndex.getProperties().getProperty(key));
found = true;
destIndex.dirtyProperties = true;
//copy new properties to real index
if (sameIndex)
for(Object o : destProperties.keySet())
String key = (String)o;
sourceIndex.setIndexProperty((String)o, destProperties.getProperty(key, null));
for(String file : Files.list(((IndexOnDisk) sourceIndex).getPath()))
if (file.startsWith(((IndexOnDisk) sourceIndex).getPrefix() + "." + sourceStructureName + "."))
((IndexOnDisk) sourceIndex).getPath() + "/" + file,
((IndexOnDisk) destIndex).getPath() + "/" + file.replaceFirst(
((IndexOnDisk) sourceIndex).getPrefix() + "\\." + sourceStructureName,
((IndexOnDisk) destIndex).getPrefix() + "." + sourceStructureName));
return found;
/** Print the contents of the meta index */
public static void printMetaIndex(Index index, String structureName) throws IOException
Iterator inputStream = (Iterator)index.getIndexStructureInputStream(structureName);
System.out.println(ArrayUtils.join(inputStream.next(), ", "));
// Prints the meta index as JSON: reads the meta index key names, then streams the named structure.
// NOTE(review): this extract is damaged here. The end of printMetaIndexJson and the declaration of
// the following method are lost, and the line starting "for (int i=0;i" fuses the two. Everything
// from that line on belongs to a method that renames the properties of an index structure.
public static void printMetaIndexJson(Index index, String structureName) throws IOException
//this is expensive
final String[] keys = index.getMetaIndex().getKeys();
final int K = keys.length;
Iterator inputStream = (Iterator)index.getIndexStructureInputStream(structureName);
String[] values = inputStream.next();
// NOTE(review): garbled line — a loop header over i runs straight into the declaration of toRemove
for (int i=0;i toRemove = new HashSet();
// toRemove / toAdd collect the changes so they can be applied after the iteration over p
Map toAdd = new HashMap();
for(Object o : p.keySet())
String key = (String)o;
// keys of the source structure: the new key carries the destination structure name
// NOTE(review): the calls that receive the renamed key (and the value) are missing
if (key.startsWith("index."+sourceStructureName + "."))
key.replaceFirst("index."+sourceStructureName + "\\.",
"index." + destinationStructureName + "."),
//System.err.println("new key is " + key.replaceFirst("index."+sourceStructureName + "\\.",
// "index." + destinationStructureName + "."));
// same renaming for the "-inputstream" keys of the source structure
if (key.startsWith("index."+sourceStructureName + "-inputstream."))
key.replaceFirst("index."+sourceStructureName + "-inputstream\\.",
"index." + destinationStructureName + "-inputstream."),
boolean OK = false;
// NOTE(review): the body of the removal loop is missing
for(String k : toRemove)
//System.err.println("Removing property " + k);
// write the renamed properties; OK is set to true next to this write
for(Map.Entry e : toAdd.entrySet())
//System.err.println("Setting property " + e.getKey());
p.setProperty(e.getKey(), e.getValue());
OK = true;
// flag the index properties as changed
index.dirtyProperties = true;
return OK;
/** Configures an object with the index, if the object implements IndexConfigurable */
public static void configure(Index index, Object o)
if (o instanceof IndexConfigurable)
/** Check to see if an object is closeable, and if so, close it. Propagate
* any exception thrown.
* @param o object to check for being closeable.
* @throws IOException if exception thrown while closing.
public static void close(Object o) throws IOException
if (o instanceof Closeable)
© 2015 - 2025 Weber Informatics LLC | Privacy Policy