
// org.terrier.structures.IndexUtil Maven / Gradle / Ivy
// The newest version!
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org/
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is IndexUtil.java
*
* The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald (original contributor)
*/
package org.terrier.structures;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import com.google.gson.Gson;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.hadoop.io.Writable;
import org.terrier.applications.CLITool;
import org.terrier.applications.CLITool.CLIParsedCLITool;
import org.terrier.querying.IndexRef;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
/**
* Class with handy utilities for use on an Index.
*
* @since 3.0
*/
public class IndexUtil {
public static class Command extends CLIParsedCLITool {
@Override
public String sourcepackage() {
return CLITool.PLATFORM_MODULE;
}
@Override
public String commandname() {
return "indexutil";
}
@Override
public String helpsummary() {
return "utilities for displaying the content of an index";
}
@Override
@SuppressWarnings({ "unchecked", "deprecation" })
public int run(CommandLine line) throws Exception {
IndexRef iRef = IndexRef.of(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
if (line.hasOption("I")) {
String indexLocation = line.getOptionValue("I");
iRef = IndexRef.of(indexLocation);
}
PropertiesIndex.setIndexLoadingProfileAsRetrieval(false);
// load the index
final Index index = IndexFactory.of(iRef);
if (index == null) {
System.err.println("Index for ref "+iRef+" not found: " + IndexOnDisk.getLastIndexLoadError());
return 2;
}
// command loop
if (line.hasOption("printpostingfile")) {
String structureName = "inverted";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
PostingIndexInputStream bpiis = (PostingIndexInputStream) index
.getIndexStructureInputStream(structureName);
bpiis.print();
bpiis.close();
} else if (line.hasOption("printterm")) {
IndexUtil.forceStructure(index, "document", new DocumentIndex() {
@Override
public int getNumberOfDocuments() {
return index.getCollectionStatistics().getNumberOfDocuments();
}
@Override
public int getDocumentLength(int docid) throws IOException {
return 0;
}
@Override
public DocumentIndexEntry getDocumentEntry(int docid) throws IOException {
return null;
}
});
String structureName = "lexicon";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
String term = line.getOptionValue("printterm");
Lexicon lex = (Lexicon) index.getIndexStructure(structureName);
PostingIndex> inv = (PostingIndex>) index.getInvertedIndex();
LexiconEntry le = lex.getLexiconEntry(term);
if (le == null) {
System.err.println("Term " + term + " not found");
return -1;
}
IterablePosting ip = inv.getPostings(le);
while (ip.next() != IterablePosting.EOL) {
System.out.print(ip.toString());
System.out.println(" ");
}
ip.close();
lex.close();
close(inv);
} else if (line.hasOption("printposting")) {
IndexUtil.forceStructure(index, "document", new DocumentIndex() {
@Override
public int getNumberOfDocuments() {
return index.getCollectionStatistics().getNumberOfDocuments();
}
@Override
public int getDocumentLength(int docid) throws IOException {
return 0;
}
@Override
public DocumentIndexEntry getDocumentEntry(int docid) throws IOException {
return null;
}
});
String[] localArgs = line.getOptionValues("printposting");
if (localArgs.length != 2) {
System.err.println("Usage: --printposting ");
return -1;
}
Lexicon lex = index.getLexicon();
PostingIndex inv = (PostingIndex) index.getInvertedIndex();
LexiconEntry le = lex.getLexiconEntry(localArgs[0]);
IterablePosting ip = inv.getPostings(le);
int targetId = Integer.parseInt(localArgs[1]);
int foundId = ip.next(targetId);
if (foundId == targetId) {
System.out.println(ip.toString());
} else {
System.err.println(
"Docid " + targetId + " not found for term " + localArgs[0] + " (nearest was " + foundId + ")");
return -1;
}
ip.close();
lex.close();
close(inv);
// } else if (line.hasOption("printbitentry")) {
// String structureName = "inverted";
// if (line.hasOption("s"))
// structureName = line.getOptionValue("s");
// List pointerList = (List) index.getIndexStructure();
// PostingIndex> bpi = (PostingIndex>) index.getIndexStructure(structureName);
// // for every docid on cmdline
// for (String arg: args) {
// BitIndexPointer pointer = pointerList.get(Integer.parseInt(arg));
// if (pointer.getNumberOfEntries() == 0)
// continue;
// System.out.print(arg + " ");
// IterablePosting ip = bpi.getPostings(pointer);
// while (ip.next() != IterablePosting.EOL) {
// System.out.print(ip.toString());
// System.out.print(" ");
// }
// System.out.println();
// }
} else if (line.hasOption("printlex")) {
String structureName = "lexicon";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
LexiconUtil.printLexicon(index, structureName);
} else if (line.hasOption("printdocument")) {
String structureName = "document";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
printDocumentIndex(index, structureName);
} else if (line.hasOption("printlist")) {
String structureName = "document";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
Iterator extends Writable> in = (Iterator extends Writable>) index
.getIndexStructureInputStream(structureName);
while (in.hasNext()) {
System.out.println(in.next().toString());
}
IndexUtil.close(in);
} else if (line.hasOption("printlistentry")) {
String structureName = "document";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
List extends Writable> list = (List extends Writable>) index.getIndexStructure(structureName);
for(String arg : line.getOptionValues("printlistentry"))
{
System.out.println(list.get(Integer.parseInt(arg)).toString());
}
IndexUtil.close(list);
} else if (line.hasOption("printmeta")) {
boolean json = line.hasOption("j");
String structureName = "meta";
if (line.hasOption("s"))
structureName = line.getOptionValue("s");
if (json)
printMetaIndexJson(index, structureName);
else
printMetaIndex(index, structureName);
} else {
System.err.println(super.help());
}
index.close();
return 0;
}
@Override
protected Options getOptions() {
Options opts = super.getOptions();
opts.addOption(Option.builder()
.longOpt("printmeta")
.desc("display contents of a meta index")
.build());
opts.addOption(Option.builder()
.longOpt("printdocument")
.desc("display contents of a document index")
.build());
opts.addOption(Option.builder()
.longOpt("printlex")
.desc("display contents of a lexicon index")
.build());
opts.addOption(Option.builder()
.longOpt("printlist")
.desc("display contents of a list index structure, such as a document index")
.build());
opts.addOption(Option.builder()
.longOpt("printlistentry")
.desc("display one entry in a list index structure, such as an entry in a document index")
.hasArgs()
.build());
//opts.addOption(Option.builder().longOpt("printbitentry").build());
opts.addOption(Option.builder()
.longOpt("printposting")
.desc("display posting for specific term and specified docid")
.hasArgs()
.build());
opts.addOption(Option.builder()
.longOpt("printpostingfile")
.desc("Display contents of a posting file (e.g. inverted or direct)")
.build());
opts.addOption(Option.builder()
.longOpt("printterm")
.hasArg()
.desc("Display contents of a posting list for one term")
.build());
opts.addOption(Option.builder("s")
.longOpt("structure")
.desc("Change the name structure being operated on")
.hasArg(true)
.build());
opts.addOption(Option.builder("j")
.desc("Display output in JSON (only for printmeta)")
.longOpt("json")
.build());
return opts;
}
}
/** Command-line entry point: passes the arguments, together with the nested
* Command class, to CLITool.run. Command prints various index structures to
* System.out; the options it registers include:
*
* - --printpostingfile - print a posting file (e.g. inverted or direct)
* - --printterm - print the posting list of one term
* - --printposting - print the posting of one term for one docid
* - --printlex - print the entire lexicon
* - --printdocument - print the document index
* - --printlist - print the named list (e.g. document index)
* - --printlistentry - print selected entries of a list structure
* - --printmeta - print the meta index
*
* See bin/terrier help indexutil for more
*/
public static void main(String[] args) {
CLITool.run(Command.class, args);
}
/**
 * Stores an arbitrary object in the structure cache of an index under the
 * given structure name. The index is cast to IndexOnDisk, so any other kind
 * of Index fails with a ClassCastException.
 * @param index index whose structure cache is modified
 * @param structureName name under which the object is cached
 * @param structure object to place into the structure cache
 */
public static void forceStructure(Index index, String structureName, Object structure)
{
	final IndexOnDisk onDisk = (IndexOnDisk) index;
	onDisk.structureCache.put(structureName, structure);
}
/**
 * Drops the named structure from the structure cache of an index.
 * The index is cast to IndexOnDisk, so any other kind of Index fails with a
 * ClassCastException.
 * @param index index whose structure cache is modified
 * @param structureName name of the cached structure to discard
 */
public static void forceReloadStructure(Index index, String structureName)
{
	final IndexOnDisk onDisk = (IndexOnDisk) index;
	onDisk.structureCache.remove(structureName);
}
/**
 * Convenience overload of {@link #reOpenIndex(IndexOnDisk)} for callers that
 * hold the index as a plain Index; the argument is cast to IndexOnDisk.
 * @param index index to close and open again
 * @return the index created again from the same path and prefix
 * @throws IOException if closing the index fails
 */
public static IndexOnDisk reOpenIndex(Index index) throws IOException
{
	final IndexOnDisk onDisk = (IndexOnDisk) index;
	return reOpenIndex(onDisk);
}
/**
 * Closes the given index and creates it again from the same path and prefix.
 * @param index index to close and open again
 * @return the result of IndexOnDisk.createIndex for the original path and prefix
 * @throws IOException if closing the index fails
 */
public static IndexOnDisk reOpenIndex(IndexOnDisk index) throws IOException
{
	// read the location before closing the index
	final String location = index.getPath();
	final String name = index.getPrefix();
	index.close();
	return IndexOnDisk.createIndex(location, name);
}
/**
 * Returns the names of the structures declared in the properties of an index.
 * A structure is declared by a property key of the form
 * <code>index.NAME.class</code>; NAME (at least one character) is returned.
 * The name is cut out by position rather than by regular expression, so
 * names that themselves contain "index" or "class" are returned intact.
 * @param index index whose properties are examined
 * @return the structure names, in the iteration order of the property keys
 */
public static String[] getStructures(PropertiesIndex index)
{
	final String prefix = "index.";
	final String suffix = ".class";
	final List<String> rtr = new ArrayList<String>();
	for (Object o : index.getProperties().keySet())
	{
		final String key = (String) o;
		// same acceptance rule as before: "index.", a non-empty name, ".class"
		if (key.length() > prefix.length() + suffix.length()
			&& key.startsWith(prefix)
			&& key.endsWith(suffix))
		{
			rtr.add(key.substring(prefix.length(), key.length() - suffix.length()));
		}
	}
	return rtr.toArray(new String[0]);
}
/**
 * Moves an index from one location to another by renaming every file in
 * srcPath whose name starts with <code>srcPrefix.</code> to the same name with
 * dstPrefix substituted, placed in dstPath. The prefixes are treated as
 * literal text. Does nothing if srcPath cannot be listed, which matches
 * {@link #deleteIndex(String, String)}.
 * @param srcPath directory containing the index to move
 * @param srcPrefix prefix of the index to move
 * @param dstPath directory to move the index files into
 * @param dstPrefix prefix the moved files should carry
 * @throws IOException if any file could not be renamed
 */
public static void renameIndex(String srcPath, String srcPrefix, String dstPath, String dstPrefix)
	throws IOException
{
	final String[] files = Files.list(srcPath);
	if (files == null)
		return;
	final String actualPrefix = srcPrefix + '.';
	for (String filename : files)
	{
		if (! filename.startsWith(actualPrefix))
			continue;
		// swap the leading prefix by position, so regex or replacement metacharacters are harmless
		final String newFilename = dstPrefix + filename.substring(srcPrefix.length());
		final String srcFile = srcPath + "/" + filename;
		final String dstFile = dstPath + "/" + newFilename;
		if (! Files.rename(srcFile, dstFile))
		{
			final String srcExists = Files.exists(srcFile) ? "exists" : "notexists";
			final String destExists = Files.exists(dstFile) ? "exists" : "notexists";
			throw new IOException("Rename of index structure file '"+srcFile+"' ("+srcExists+") to " +
				"'"+ dstFile +"' ("+destExists+") failed - likely that source file is still open. " +
				"Possible indexing bug?");
		}
	}
}
/**
 * Deletes every file in the given directory whose name starts with
 * <code>prefix.</code>. Does nothing if the directory listing is null.
 * @param path directory containing the index
 * @param prefix prefix of the index to delete
 * @throws IOException declared for callers; propagated from the file operations
 */
public static void deleteIndex(String path, String prefix)
	throws IOException
{
	final String[] listing = Files.list(path);
	if (listing != null)
	{
		final String wanted = prefix + '.';
		for (int i = 0; i < listing.length; i++)
		{
			final String name = listing[i];
			if (name.startsWith(wanted))
			{
				Files.delete(path + "/" + name);
			}
		}
	}
}
/**
 * Prints every entry of a document index structure to System.out, one per
 * line, as <code>docid: entry</code>, where docid counts up from 0 in
 * iteration order. The input stream is closed even if printing fails.
 * @param index index to read from
 * @param structureName name of the document index structure
 * @throws IOException if closing the structure's input stream fails
 */
@SuppressWarnings("unchecked")
public static void printDocumentIndex(Index index, String structureName) throws IOException
{
	final Iterator<DocumentIndexEntry> iterator =
		(Iterator<DocumentIndexEntry>) index.getIndexStructureInputStream(structureName);
	try
	{
		int docid = 0;
		while (iterator.hasNext())
		{
			final DocumentIndexEntry die = iterator.next();
			System.out.println(docid +": " + die.toString());
			docid++;
		}
	}
	finally
	{
		close(iterator);
	}
}
/**
 * Deletes the named structure from the specified index: removes every
 * property whose key starts with <code>index.NAME.</code> and deletes every
 * file in the index directory whose name starts with <code>PREFIX.NAME.</code>.
 * When properties were removed the index is marked as having dirty
 * properties, as the other property-changing methods of this class do.
 * @param index index to operate on
 * @param structureName name of structure to delete
 * @return true if at least one property of the structure was found and removed, false otherwise
 * @throws IOException declared for callers; propagated from the file operations
 */
public static boolean deleteStructure(IndexOnDisk index, String structureName) throws IOException
{
	final String propertyPrefix = "index." + structureName + ".";
	// collect first: removing while iterating the key set is not safe
	final List<String> toRemove = new ArrayList<String>();
	for (Object o : index.getProperties().keySet())
	{
		final String key = (String) o;
		if (key.startsWith(propertyPrefix))
		{
			toRemove.add(key);
		}
	}
	for (String key : toRemove)
	{
		index.getProperties().remove(key);
	}
	final boolean found = ! toRemove.isEmpty();
	if (found)
	{
		index.dirtyProperties = true;
	}

	final String[] files = Files.list(index.getPath());
	if (files != null)
	{
		final String filePrefix = index.getPrefix() + "." + structureName + ".";
		for (String file : files)
		{
			if (file.startsWith(filePrefix))
			{
				Files.delete(index.getPath() + "/" + file);
			}
		}
	}
	return found;
}
/**
 * Reports whether the structure cache of the given index currently holds an
 * entry under the given structure name.
 * @param index index to examine
 * @param structureName name of the structure to look for
 * @return true if the structure cache contains a key equal to structureName
 */
public static boolean isStructureOpen(IndexOnDisk index, String structureName) {
	final boolean cached = index.structureCache.containsKey(structureName);
	return cached;
}
/**
 * Copies an index structure from one index to another, optionally under a new name.
 * Every property <code>index.SOURCE.*</code> of the source index is set on the
 * destination index as <code>index.DEST.*</code>, and every file
 * <code>SRCPREFIX.SOURCE.*</code> in the source directory is copied to
 * <code>DSTPREFIX.DEST.*</code> in the destination directory, so that the
 * copied files carry the same structure name as the copied properties.
 * Names and prefixes are treated as literal text.
 * @param sourceIndex index to copy from
 * @param destIndex index to copy to; may be the same object as sourceIndex
 * @param sourceStructureName name of the structure in the source index
 * @param destinationStructureName name the structure gets in the destination index
 * @return true if at least one property of the source structure was found
 * @throws IOException if an IO problem occurs
 */
public static boolean copyStructure(IndexOnDisk sourceIndex, IndexOnDisk destIndex, String sourceStructureName, String destinationStructureName) throws IOException
{
	/* If source and destination are the same index, altering its Properties while
	 * iterating over them would raise a ConcurrentModificationException. In that case
	 * new properties are gathered in a temporary Properties and applied afterwards. */
	final boolean sameIndex = sourceIndex == destIndex;
	final Properties destProperties = sameIndex ? new Properties() : destIndex.properties;

	final String srcKeyPrefix = "index." + sourceStructureName + ".";
	final String dstKeyPrefix = "index." + destinationStructureName + ".";
	boolean found = false;
	for (Object o : sourceIndex.getProperties().keySet())
	{
		final String key = (String) o;
		if (key.startsWith(srcKeyPrefix))
		{
			destProperties.setProperty(
				dstKeyPrefix + key.substring(srcKeyPrefix.length()),
				sourceIndex.getProperties().getProperty(key));
			found = true;
			destIndex.dirtyProperties = true;
		}
	}
	// copy the gathered properties to the real index
	if (sameIndex)
	{
		for (Object o : destProperties.keySet())
		{
			final String key = (String) o;
			sourceIndex.setIndexProperty(key, destProperties.getProperty(key, null));
		}
	}

	// copy the files, renaming both the index prefix and the structure name
	final String[] files = Files.list(sourceIndex.getPath());
	if (files != null)
	{
		final String srcFilePrefix = sourceIndex.getPrefix() + "." + sourceStructureName + ".";
		final String dstFilePrefix = destIndex.getPrefix() + "." + destinationStructureName + ".";
		for (String file : files)
		{
			if (file.startsWith(srcFilePrefix))
			{
				Files.copyFile(
					sourceIndex.getPath() + "/" + file,
					destIndex.getPath() + "/" + dstFilePrefix + file.substring(srcFilePrefix.length()));
			}
		}
	}
	return found;
}
/**
 * Prints the contents of a meta index structure to System.out: one line per
 * entry, with the values of the entry joined by ", ". The input stream is
 * closed even if printing fails.
 * @param index index to read from
 * @param structureName name of the meta index structure
 * @throws IOException if closing the structure's input stream fails
 */
@SuppressWarnings("unchecked")
public static void printMetaIndex(Index index, String structureName) throws IOException
{
	final Iterator<String[]> inputStream =
		(Iterator<String[]>) index.getIndexStructureInputStream(structureName);
	try
	{
		while (inputStream.hasNext())
		{
			System.out.println(ArrayUtils.join(inputStream.next(), ", "));
		}
	}
	finally
	{
		IndexUtil.close(inputStream);
	}
}
/** Prints the entries of a meta index structure, starting each entry with "{".
* NOTE(review): presumably the output is JSON (judging by the method name and
* the --json option text); the code that formats the values is not visible in
* this copy of the file, see the note further down.
*/
@SuppressWarnings("unchecked")
public static void printMetaIndexJson(Index index, String structureName) throws IOException
{
//this is expensive
final String[] keys = index.getMetaIndex().getKeys();
final int K = keys.length;
Iterator inputStream = (Iterator)index.getIndexStructureInputStream(structureName);
while(inputStream.hasNext())
{
System.out.print("{");
String[] values = inputStream.next();
// NOTE(review): the next line is truncated in this copy of the file. Everything
// between the loop condition and the declaration of toRemove is missing: the rest
// of printMetaIndexJson and the header and opening statements of a following
// method (one that has index, p, sourceStructureName and destinationStructureName
// in scope and returns a boolean). Restore it from the upstream source; do not
// attempt to compile this region as it stands.
for (int i=0;i toRemove = new HashSet();
Map toAdd = new HashMap();
// Plan the re-keying first: each matching key is queued for removal and its value
// is queued under the corresponding destination-structure key.
for(Object o : p.keySet())
{
String key = (String)o;
if (key.startsWith("index."+sourceStructureName + "."))
{
toAdd.put(
key.replaceFirst("index."+sourceStructureName + "\\.",
"index." + destinationStructureName + "."),
p.getProperty(key));
toRemove.add(key);
//System.err.println("new key is " + key.replaceFirst("index."+sourceStructureName + "\\.",
// "index." + destinationStructureName + "."));
}
// keys of the form index.<source>-inputstream.* are re-keyed in the same way
if (key.startsWith("index."+sourceStructureName + "-inputstream."))
{
toAdd.put(
key.replaceFirst("index."+sourceStructureName + "-inputstream\\.",
"index." + destinationStructureName + "-inputstream."),
p.getProperty(key));
toRemove.add(key);
}
}
// OK becomes true only if at least one property is set under a new key
boolean OK = false;
for(String k : toRemove)
{
//System.err.println("Removing property " + k);
p.remove(k);
}
for(Map.Entry e : toAdd.entrySet())
{
//System.err.println("Setting property " + e.getKey());
p.setProperty(e.getKey(), e.getValue());
OK = true;
}
// the properties are marked dirty and flushed whether or not anything was re-keyed
index.dirtyProperties = true;
index.flush();
return OK;
}
/**
 * Hands the index to the given object if, and only if, that object implements
 * IndexConfigurable; any other object (including null) is left untouched.
 * @param index index to pass to the object
 * @param o object that may want to be configured with the index
 */
public static void configure(Index index, Object o)
{
	if (! (o instanceof IndexConfigurable))
	{
		return;
	}
	final IndexConfigurable target = (IndexConfigurable) o;
	target.setIndex(index);
}
/**
 * Closes the given object when it implements {@link Closeable}; objects of any
 * other type, and null, are ignored. An exception raised by the close call is
 * passed on to the caller.
 * @param o object that may be closeable
 * @throws IOException if closing the object fails
 */
public static void close(Object o) throws IOException
{
	if (! (o instanceof Closeable))
	{
		return;
	}
	final Closeable resource = (Closeable) o;
	resource.close();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy