
org.terrier.structures.IndexOnDisk Maven / Gradle / Ivy
The newest version!
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org/
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is IndexOnDisk.java
*
* The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald (original contributor)
* Richard McCreadie
* Stuart Mackie
*/
package org.terrier.structures;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import org.terrier.Version;
import org.terrier.querying.IndexRef;
import org.terrier.structures.IndexFactory.IndexLoader;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
import org.terrier.utility.restructure.Terrier4;
import org.terrier.utility.restructure.Terrier5;
/**
* The replacement for what was Index in earlier Terrier versions.
* Represents the most common type of index, i.e. one which is stored
* on disk.
* @author Stuart Mackie, Craig Macdonald, Richard McCreadie
* @since 4.0
*/
public class IndexOnDisk extends PropertiesIndex {
public static class DiskIndexLoader implements IndexLoader
{
@Override
public boolean supports(IndexRef ref) {
String l = ref.toString();
if (ref.size() > 1)
return false; //this is a multi-index
if (l.startsWith("http") || l.startsWith("https") || l.startsWith("concurrent"))
return false;
//a normal indexref should point to a data.properties file
if (l.endsWith(".properties"))
return Files.exists(l);
// but we also support indexrefs pointing just to a directory containing an index
return Files.exists(l + "/data.properties");
}
@Override
public Index load(IndexRef ref) {
String l = ref.toString();
File file = new File(l);
String path, prefix;
if (l.endsWith(".properties"))
{
path = file.getParent();
prefix = file.getName().replace(".properties", "");
} else {
path = l;
prefix = "data";
}
return IndexOnDisk.createIndex(path, prefix);
}
@Override
public Class extends Index> indexImplementor(IndexRef ref) {
return IndexOnDisk.class;
}
}
/** path component of this index's location */
protected String path;
/** prefix component of this index's location */
protected String prefix;
/** Cache of all opened index structures, keyed by structure name; input streams are not cached */
protected final HashMap<String, Object> structureCache = new HashMap<String, Object>(
    10);
/** Starts as true and is set to false when loading the index fails */
protected boolean loadSuccess = true;
/** Human-readable reason for a load failure; null while no failure has been recorded */
protected String loadError = null;
/**
 * Opens the index found at the path and prefix configured in
 * {@link ApplicationSetup}, without asking for a new index to be created.
 */
protected IndexOnDisk() {
    this(
        ApplicationSetup.TERRIER_INDEX_PATH,
        ApplicationSetup.TERRIER_INDEX_PREFIX,
        false);
}
/**
* Constructs a new Index object. Don't call this method, call the
* createIndex(String) factory method to construct an Index object.
*
* @param _path
* String the path in which the data structures will be created.
* @param _prefix
* String the prefix of the files to be created.
* @param isNew
* where a new Index should be created if there is no index at
* the specified location
*/
protected IndexOnDisk(String _path, String _prefix, boolean isNew) {
super(0l, 0l, 0l);
if (!(new File(_path)).isAbsolute())
_path = ApplicationSetup.makeAbsolute(_path,
ApplicationSetup.TERRIER_VAR);
this.path = _path;
this.prefix = _prefix;
if (isNew && (! Files.exists(this.path) ))
{
String message = "Cannot create new index: path " + this.path + " does not exist, or cannot be written to";
logger.error(message);
throw new IllegalArgumentException(message);
}
boolean indexExists = loadProperties();
if (isNew && !indexExists) {
logger.debug("Creating new index : " + this.toString());
setIndexProperty("index.terrier.version",
ApplicationSetup.TERRIER_VERSION);
setIndexProperty("index.created", "" + System.currentTimeMillis());
setIndexProperty("num.Documents", "0");
setIndexProperty("num.Terms", "0");
setIndexProperty("num.Tokens", "0");
setIndexProperty("num.Pointers", "0");
loadUpdatingStatistics();
dirtyProperties = true;
loadSuccess = true;
} else if (indexExists) {
logger.debug("Loading existing index : " + this.toString());
// note the order - some structures will require collection
// statistics, so load this first.
loadStatistics();
loadIndices();
}
}
/**
 * Constructs an index object by delegating directly to the superclass
 * constructor. Unlike the (path, prefix, isNew) constructor, this one assigns
 * neither {@link #path} nor {@link #prefix} and reads no properties.
 * <p>
 * The arguments are now forwarded to the superclass instead of being replaced
 * by zeros.
 * NOTE(review): their meaning is defined by the superclass and is not visible
 * here - confirm against PropertiesIndex.
 *
 * @param l first argument passed through to the superclass constructor
 * @param m second argument passed through to the superclass constructor
 * @param n third argument passed through to the superclass constructor
 */
public IndexOnDisk(long l, long m, long n) {
    super(l, m, n);
}
/**
 * Closes every cached index structure, empties the cache and then writes any
 * dirty properties to disk. Closing is best-effort: a structure that fails to
 * close is reported through the logger and the remaining structures are still
 * closed.
 *
 * @throws IOException if flushing the properties fails
 */
@Override
public void close() throws IOException {
    // invoke the close methods on all currently open index structures
    for (Object o : structureCache.values()) {
        try {
            IndexUtil.close(o);
        } catch (IOException ioe) {
            // deliberately not rethrown, so one failure cannot prevent the
            // other structures from closing or the properties from flushing
            logger.warn("Problem closing an index structure", ioe);
        }
    }
    structureCache.clear();
    flushProperties();
}
/**
 * Persists pending changes; for this index that means writing the dirty
 * properties via {@link #flushProperties()}.
 *
 * @throws IOException if the properties cannot be written
 */
@Override
public void flush() throws IOException {
    this.flushProperties();
}
/**
 * Write any dirty properties down to disk. Does nothing when no property has
 * changed. When the properties file (or, if it does not exist yet, the index
 * directory) is not writable, a warning is logged and the properties remain
 * marked as dirty.
 *
 * @throws IOException if opening, writing or closing the properties file fails
 */
protected void flushProperties() throws IOException {
    if (!dirtyProperties)
        return;
    final String propertiesFilename = path
        + ApplicationSetup.FILE_SEPARATOR + prefix
        + PROPERTIES_SUFFIX;
    // an existing file must itself be writable; otherwise the directory must be
    final boolean writable = Files.exists(propertiesFilename)
        ? Files.canWrite(propertiesFilename)
        : Files.canWrite(path);
    if (!writable) {
        logger.warn("Could not write to index properties at "
            + propertiesFilename
            + " because you do not have write permission on the index - some changes may be lost");
        return;
    }
    final OutputStream outputStream = Files
        .writeFileStream(propertiesFilename);
    try {
        properties.store(outputStream, this.toString());
    } finally {
        // close even when store() fails, so the stream is never leaked
        outputStream.close();
    }
    dirtyProperties = false;
}
/**
 * Returns the structure stored under the name "collectionstatistics".
 *
 * @return the collection statistics, or null if that structure is unavailable
 */
@Override
public CollectionStatistics getCollectionStatistics() {
    final Object statistics = getIndexStructure("collectionstatistics");
    return (CollectionStatistics) statistics;
}
/**
 * Returns the structure stored under the name "direct".
 *
 * @return the direct index, or null if that structure is unavailable
 */
// NOTE(review): the return type appears here without a type argument - confirm
// against the declaration in Index whether one is expected.
@SuppressWarnings("unchecked")
@Override
public PostingIndex getDirectIndex() {
    final Object direct = getIndexStructure("direct");
    return (PostingIndex) direct;
}
/**
 * Returns the structure stored under the name "document".
 *
 * @return the document index, or null if that structure is unavailable
 */
@Override
public DocumentIndex getDocumentIndex() {
    final Object document = getIndexStructure("document");
    return (DocumentIndex) document;
}
@SuppressWarnings("unchecked")
public T getIndexStructure(String structureName, Class extends T> clazz) {
return (T)getIndexStructure(structureName);
}
/**
 * Looks up an index structure by name. A structure that has already been
 * opened is served from the cache; otherwise it is loaded and, when loading
 * succeeds, remembered for later calls.
 *
 * @param structureName
 *            name of the required structure
 * @return desired object or null if not found
 */
public Object getIndexStructure(String structureName) {
    final Object cached = structureCache.get(structureName);
    if (cached != null) {
        return cached;
    }
    final Object loaded = loadIndexStructure(structureName);
    if (loaded != null) {
        // only successful loads are cached, so a failed load is retried next time
        structureCache.put(structureName, loaded);
    }
    return loaded;
}
@SuppressWarnings("unchecked")
public T getIndexStructureInputStream(String structureName, Class extends T> clazz) {
return (T)getIndexStructureInputStream(structureName);
}
/**
 * Return the input stream associated with the specified structure of this
 * index. A new instance is loaded on every call; input streams are never
 * placed in the structure cache.
 *
 * @param structureName The name of the structure of which you want the inputstream. Eg "lexicon"
 * @return Required structure, or null if not found
 */
@Override
public Object getIndexStructureInputStream(String structureName) {
    final String streamStructureName = structureName + "-inputstream";
    return loadIndexStructure(streamStructureName);
}
@Override
public PostingIndex> getInvertedIndex() {
return (PostingIndex>) getIndexStructure("inverted");
}
/**
 * Returns the structure stored under the name "lexicon".
 *
 * @return the lexicon, or null if that structure is unavailable
 */
// NOTE(review): the return type appears here without a type argument - confirm
// against the declaration in Index whether one is expected.
@SuppressWarnings("unchecked")
@Override
public Lexicon getLexicon() {
    final Object lexicon = getIndexStructure("lexicon");
    return (Lexicon) lexicon;
}
/**
 * Returns the structure stored under the name "meta".
 *
 * @return the meta index, or null if that structure is unavailable
 */
@Override
public MetaIndex getMetaIndex() {
    final Object meta = getIndexStructure("meta");
    return (MetaIndex) meta;
}
/**
 * Returns the path component of this index's location.
 *
 * @return the value of the {@link #path} field
 */
public String getPath() {
    return this.path;
}
/**
 * Returns the prefix component of this index's location.
 *
 * @return the value of the {@link #prefix} field
 */
public String getPrefix() {
    return this.prefix;
}
/**
* Load a new instance of the named index structure.
*
* @param structureName
* name of the required structure
* @return desired object or null if not found
*/
protected Object loadIndexStructure(String structureName) {
logger.debug("Attempting to load structure " + structureName);
try {
// figure out the correct class
String structureClassName = properties.getProperty("index."
+ structureName + ".class");
if (structureClassName == null) {
logger.error("This index (" + this.toString()
+ ") doesnt have an index structure called "
+ structureName + ": property index." + structureName
+ ".class not found");
logger.error("Valid structures are: "
+ Arrays.deepToString(IndexUtil.getStructures(this)));
return null;// TODO exceptions?
}
if (structureClassName.startsWith("uk.ac.gla.terrier"))
structureClassName = structureClassName.replaceAll(
"uk.ac.gla.terrier", "org.terrier");
// obtain the class definition for the index structure
Class> indexStructureClass = null;
try {
indexStructureClass = ApplicationSetup.getClass(structureClassName, false);
} catch (ClassNotFoundException cnfe) {
logger.error("ClassNotFoundException: This index ("
+ this.toString()
+ ") references an unknown index structure class: "
+ structureName + " looking for " + structureClassName);
cnfe.printStackTrace();
return null;// TODO exceptions?
}
// build up the constructor parameter type array
final ArrayList> paramTypes = new ArrayList>(5);
final String typeList = properties.getProperty(
"index." + structureName + ".parameter_types",
"java.lang.String,java.lang.String").trim();
Object rtr = null;
// for objects with constructor arguments
if (typeList.length() > 0) {
final String[] types = typeList.split("\\s*,\\s*");
for (String t : types) {
if (t.startsWith("uk.ac.gla.terrier"))
t = t.replaceAll("uk.ac.gla.terrier", "org.terrier");
paramTypes.add(ApplicationSetup.getClass(t));
}
Class>[] param_types = paramTypes.toArray(EMPTY_CLASS_ARRAY);
// build up the constructor parameter value array
String[] params = properties.getProperty(
"index." + structureName + ".parameter_values",
"path,prefix").split("\\s*,\\s*");
Object[] objs = new Object[paramTypes.size()];
int i = 0;
for (String p : params) {
// System.err.println("looking for parameter value called "+
// p + " with type '" + param_types[i]+ "'");
if (p.equals("path"))
objs[i] = path;
else if (p.equals("prefix"))
objs[i] = prefix;
else if (p.equals("index"))
objs[i] = this;
else if (p.equals("structureName")) {
final String tmp = structureName;
objs[i] = tmp.replaceAll("-inputstream$", "");
} else if (param_types[i].equals(java.lang.Class.class)) {
// System.err.println("loading class called "+p);
if (p.startsWith("uk.ac.gla.terrier"))
p = p.replaceAll("uk.ac.gla.terrier", "org.terrier");
objs[i] = ApplicationSetup.getClass(p);
} else if (p.endsWith("-inputstream"))// no caching for
// input streams
objs[i] = loadIndexStructure(p);
else if (p.matches("^\\$\\{.+\\}$")) {
String propertyName = p.substring(2, p.length() - 1);
objs[i] = properties.getProperty(propertyName,
ApplicationSetup
.getProperty(propertyName, null));
if (objs[i] == null)
throw new IllegalArgumentException("Property "
+ propertyName + " not found");
}
else if (p.matches("^\".+\"$")) {
String literal = p.substring(1, p.length() - 1);
if (param_types[i].equals(String.class))
objs[i] = literal;
else if (param_types[i].equals(Integer.class))
objs[i] = Integer.valueOf(Integer.parseInt(literal));
else
throw new IllegalArgumentException("Type "
+ param_types[i] + " is not supported for literal parameter values");
} else
objs[i] = getIndexStructure(p);
i++;
}
// get the index structure using the appropriate constructor
// with correct parameters
rtr = indexStructureClass.getConstructor(param_types)
.newInstance(objs);
} else { // no constructor arguments
rtr = indexStructureClass.newInstance();
}
// Special case hacks
// 1. set the Index properties if desired
if (rtr instanceof IndexConfigurable) {
((IndexConfigurable) rtr).setIndex(this);
}
// we're done
return rtr;
} catch (InvocationTargetException ite) {
logger.error("Couldn't load an index structure called "
+ structureName, ite.getCause());
return null;
} catch (Throwable t) {
logger.error("Couldn't load an index structure called "
+ structureName, t);
return null;
}
}
/**
 * Pre-loads every index structure declared in the index properties, except
 * input streams. Nothing is pre-loaded when the index property
 * <tt>index.preloadIndices.disabled</tt> is true (it defaults to false) or
 * when RETRIEVAL_LOADING_PROFILE is false; structures are then loaded as
 * required. A structure that cannot be loaded is recorded in
 * <tt>loadError</tt> and causes <tt>loadSuccess</tt> to be set to false.
 */
protected void loadIndices() {
    final String disabledSetting = properties.getProperty(
        "index.preloadIndices.disabled", "false");
    if (Boolean.parseBoolean(disabledSetting) || !RETRIEVAL_LOADING_PROFILE) {
        return;
    }
    boolean anyFailure = false;
    // look for all index structures
    for (Object propertyKey : properties.keySet()) {
        final String keyName = (String) propertyKey;
        if (!keyName.matches("^index\\..+\\.class$")) {
            continue;
        }
        if (keyName.matches("^index\\..+-inputstream.class$")) {
            continue; // don't pre-load input streams
        }
        // the structure name is the segment between "index." and the next dot
        final String structureName = keyName.split("\\.")[1];
        if (getIndexStructure(structureName) == null) {
            loadError = "Could not load an index structure called "
                + structureName;
            anyFailure = true;
        }
    }
    if (anyFailure) {
        this.loadSuccess = false;
    }
}
/**
 * Loads the index properties file and checks the index version recorded in it.
 * <p>
 * On any failure (file missing, unreadable, no <tt>index.terrier.version</tt>,
 * a version that cannot be parsed, or a version older than
 * MINIMUM_INDEX_TERRIER_VERSION) <tt>loadSuccess</tt> is set to false,
 * <tt>loadError</tt> describes the problem and false is returned. Properties
 * of version 3 and 4 indices are upgraded in memory only.
 *
 * @return true if the properties were loaded and describe a usable index
 */
protected boolean loadProperties() {
    final String propertiesFilename = path + ApplicationSetup.FILE_SEPARATOR
        + prefix + ".properties";
    try {
        if (!allExists(propertiesFilename)) {
            loadSuccess = false;
            loadError = "Index not found: " + propertiesFilename
                + " not found.";
            return false;
        }
        final InputStream propertyStream = Files
            .openFileStream(propertiesFilename);
        try {
            properties.load(propertyStream);
        } finally {
            // close even when load() fails, so the stream is never leaked
            propertyStream.close();
        }
    } catch (IOException ioe) {
        loadSuccess = false;
        logger.error("Problem loading index properties", ioe);
        loadError = "Problem loading index properties: " + ioe;
        return false;
    }
    final String versionString = properties.getProperty(
        "index.terrier.version", null);
    if (versionString == null) {
        loadSuccess = false;
        logger.error("index.terrier.version not set in index, invalid index?");
        loadError = "index.terrier.version not set in index";
        return false;
    }
    if (versionString.equals("5.0-SNAPSHOT"))
        logger.warn("Be careful, 5.0-SNAPSHOT index -- perhaps this index perhaps should be declared as 4.2?");
    final int majorVersion;
    try {
        majorVersion = Integer.parseInt(versionString.split("\\.", 2)[0]);
    } catch (NumberFormatException nfe) {
        // a malformed version marks the index as invalid, like a missing
        // one, rather than escaping as an unchecked exception
        loadSuccess = false;
        loadError = "index.terrier.version is not a valid version: " + versionString;
        logger.error(loadError, nfe);
        return false;
    }
    if (majorVersion < MINIMUM_INDEX_TERRIER_VERSION) {
        loadSuccess = false;
        logger.error(loadError = "This index is too old. Need at least version "
            + MINIMUM_INDEX_TERRIER_VERSION + " index");
        return false;
    } else if (majorVersion == 3) {
        Terrier4 upgrade = new Terrier4();
        upgrade.updateIndexProperties(properties);
        logger.warn(this.toString() + " is a Terrier " + versionString
            + " index - temporarily upgrading. Use " + Terrier4.class.getName() + " to make changes permanent");
    } else if (majorVersion == 4) {
        Terrier5 upgrade = new Terrier5();
        upgrade.updateIndexProperties(properties);
        logger.warn(this.toString() + " is a Terrier " + versionString
            + " index - temporarily upgrading. Use " + Terrier5.class.getName() + " to make changes permanent. Some functionality may be lost.");
    } else if (majorVersion > Version.getMajorVersion()) {
        logger.warn(this.toString() + " is a Terrier " + versionString + " index, which is more modern than this release ("+Version.VERSION+"). YMMV!");
    }
    // otherwise the index is of this version, all good
    return true;
}
/**
 * Builds a plain CollectionStatistics object from the index properties and
 * stores it in the structure cache under "collectionstatistics". Intended for
 * an index whose statistics do not change after loading.
 */
protected void loadStatistics() {
    // the field count comes from the inverted structure if present, else the direct one
    final String fieldCountKey = this.hasIndexStructure("inverted")
        ? "index.inverted.fields.count"
        : (this.hasIndexStructure("direct") ? "index.direct.fields.count" : null);
    final int numFields = (fieldCountKey == null)
        ? 0
        : Integer.parseInt(properties.getProperty(fieldCountKey, "0"));

    // per-field token counts
    final long[] fieldTokens = new long[numFields];
    for (int f = 0; f < fieldTokens.length; f++) {
        final String tokensKey = "num.field." + f + ".Tokens";
        fieldTokens[f] = Long.parseLong(properties.getProperty(tokensKey, "0"));
    }

    // block (position) information follows the same inverted-then-direct preference
    final String blocksKey = this.hasIndexStructure("inverted")
        ? "index.inverted.blocks"
        : (this.hasIndexStructure("direct") ? "index.direct.blocks" : null);
    final boolean hasBlocks = (blocksKey != null)
        && getIntIndexProperty(blocksKey, 0) > 0;

    // create collection statistics
    final int numDocuments = Integer.parseInt(properties.getProperty("num.Documents", "0"));
    final int numTerms = Integer.parseInt(properties.getProperty("num.Terms", "0"));
    final long numTokens = Long.parseLong(properties.getProperty("num.Tokens", "0"));
    final long numPointers = Long.parseLong(properties.getProperty("num.Pointers", "0"));
    final String[] fieldNames = ArrayUtils.parseCommaDelimitedString(
        properties.getProperty("index.inverted.fields.names", ""));
    final CollectionStatistics statistics = new CollectionStatistics(
        numDocuments, numTerms, numTokens, numPointers,
        fieldTokens, fieldNames, hasBlocks);
    structureCache.put("collectionstatistics", statistics);
}
/**
 * Stores an UpdatingCollectionStatistics bound to this index in the structure
 * cache under "collectionstatistics". Meant for an index that is still being
 * built: slower than plain statistics, but able to support updates of the
 * index statistics.
 */
protected void loadUpdatingStatistics() {
    final UpdatingCollectionStatistics updating = new UpdatingCollectionStatistics(this);
    structureCache.put("collectionstatistics", updating);
}
/**
 * Describes this index by the location of its properties file.
 *
 * @return the path, a "/", the prefix and the ".properties" extension joined together
 */
@Override
public String toString() {
    final StringBuilder location = new StringBuilder();
    location.append(path).append("/").append(prefix).append(".properties");
    return location.toString();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy