// edu.stanford.nlp.util.HashIndex Maven / Gradle / Ivy
package edu.stanford.nlp.util;
import java.io.*;
import java.util.*;
import java.util.concurrent.Semaphore;
import java.util.function.Supplier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
/**
 * Implements an Index that supports constant-time lookup in
 * both directions (via {@code get(int)} and {@code indexOf(E)}).
 * The {@code indexOf(E)} method compares objects by
 * {@code equals()}, as other Collections do.
 *
 * <p>The typical usage would be:
 * {@code Index index = new Index(collection);}
 * followed by
 * {@code int i = index.indexOf(str);}
 * or
 * {@code String s = index.get(i);}
 *
 * <p>An Index can be locked or unlocked: a locked index cannot have new
 * items added to it.
 *
 * @author Dan Klein
 * @version 1.0
 * @see AbstractCollection
 * @since 1.0
 * @author Eric Yeh (added write to/load from buffer)
 */
// todo [cdm 2014]: Delete "extends AbstractCollection" but this will break serialization....
public class HashIndex extends AbstractCollection implements Index, RandomAccess {
// These variables are also used in IntArrayIndex.
/**
 * The indexed objects; an element's position in this list is its index.
 * Should really almost always be an ArrayList.
 */
private final List<E> objects;
/** Reverse lookup from an indexed object to its position in {@code objects}. */
private final Map<E, Integer> indexes;
/** Whether new elements are currently refused. Mutable; starts out {@code false}. */
private boolean locked;
private static final long serialVersionUID = 5398562825928375260L;
/**
 * Removes every element from this Index by emptying both the object
 * list and the object-to-position map.
 */
@Override
public void clear() {
  this.objects.clear();
  this.indexes.clear();
}
/**
 * Returns the index of each element of a collection, in the collection's
 * iteration order. Each entry is whatever {@code indexOf} returns for
 * that element.
 *
 * @param elements The collection of items to look up
 * @return An array of indices, one per element of {@code elements}
 */
public int[] indices(Collection<E> elements) {
  int[] indices = new int[elements.size()];
  int i = 0;
  // The parameter must be typed as Collection<E>: iterating a raw
  // Collection yields Object, which cannot be bound to an E loop variable.
  for (E elem : elements) {
    indices[i++] = indexOf(elem);
  }
  return indices;
}
/**
 * Looks up the objects corresponding to an array of indices, and returns them in a {@link Collection}.
 * This collection is not a copy: each access reads the given array and the
 * Index's own object list at the time of the call.
 *
 * @param indices An array of indices
 * @return a {@link Collection} of the objects corresponding to the indices argument.
 */
@Override
public Collection<E> objects(final int[] indices) {
  // Typed as AbstractList<E> so that get() can legally return E.
  return new AbstractList<E>() {
    @Override
    public E get(int index) {
      // Two-step lookup: position in the view -> index value -> indexed object.
      return objects.get(indices[index]);
    }

    @Override
    public int size() {
      return indices.length;
    }
  };
}
/**
 * Reports how many objects are currently indexed.
 *
 * @return the size of the underlying object list.
 */
@Override
public int size() {
  return this.objects.size();
}
/**
 * Fetches the object stored at a given index.
 *
 * @param i the integer index whose object is wanted
 * @return the object stored at index {@code i}.
 * @throws ArrayIndexOutOfBoundsException if {@code i} is negative or not
 *         smaller than the number of indexed objects
 */
@Override
public E get(int i) {
  final boolean inRange = i >= 0 && i < objects.size();
  if (inRange) {
    return objects.get(i);
  }
  throw new ArrayIndexOutOfBoundsException(
      "Index " + i + " outside the bounds [0," + size() + ")");
}
/**
 * Returns a complete {@link List} of indexed objects, in the order of their indices. DANGER!
 * The current implementation returns the actual index list, not a defensive copy. Messing with this List
 * can seriously screw up the state of the Index. (Perhaps this method needs to be eliminated? I don't think it's
 * ever used in ways that we couldn't use the Index itself for directly. --Roger, 12/29/04)
 *
 * @return the live backing {@link List} of indexed objects
 */
@Override
public List<E> objectsList() {
  return objects;
}
/**
 * Reports the current lock state of the Index.
 *
 * @return {@code true} if the Index is locked, {@code false} otherwise
 */
@Override
public boolean isLocked() {
  return this.locked;
}
/**
 * Locks the Index. While locked, no new elements can be added: calls to
 * {@link #add} leave the Index unchanged and return {@code false}.
 */
@Override
public void lock() {
  this.locked = true;
}
/**
 * Unlocks the Index, clearing the flag that makes {@link #add} refuse
 * new elements and return {@code false}.
 */
@Override
public void unlock() {
  this.locked = false;
}
/**
 * Looks up the index assigned to an object.
 *
 * @param o the object to look up
 * @return the object's index, or {@code -1} if it has no entry in the lookup map
 */
@Override
public int indexOf(E o) {
  final Integer position = indexes.get(o);
  // The Integer branch is only unboxed when it is non-null.
  return (position == null) ? -1 : position;
}
/**
 * Returns the index of an object, adding the object first if it is absent
 * and the Index is not locked.
 *
 * <p>The first lookup is done without holding the semaphore. Only when the
 * object appears to be absent is the semaphore acquired, and the lookup is
 * then repeated before inserting, so that an insert made by another caller
 * in the meantime is not duplicated.
 *
 * @param o the object to look up or add
 * @return the index of {@code o}, or {@code -1} if it is absent and the Index is locked
 * @throws RuntimeInterruptedException if the thread is interrupted while
 *         waiting to acquire the semaphore
 */
@Override
public int addToIndex(E o) {
  Integer index = indexes.get(o);
  if (index != null) {
    return index;
  }
  if (locked) {
    return -1;
  }
  try {
    semaphore.acquire();
  } catch (InterruptedException e) {
    // Nothing was acquired, so there is nothing to release here.
    throw new RuntimeInterruptedException(e);
  }
  try {
    index = indexes.get(o);
    if (index == null) {
      index = objects.size();
      objects.add(o);
      indexes.put(o, index);
    }
  } finally {
    // Always hand the permit back: if the insert throws (for example from
    // the element's hashCode/equals), a leaked permit would block every
    // later caller forever.
    semaphore.release();
  }
  return index;
}
/**
 * Add the given item to the index, but without acquiring the semaphore
 * that {@code addToIndex} uses.
 * Use this method with care!
 * But, this offers a noticeable performance improvement if it is safe to use.
 *
 * @param o the object to look up or add
 * @return the index of {@code o}, or {@code -1} if it is absent and the Index is locked
 * @see #addToIndex
 */
public int addToIndexUnsafe(E o) {
  if (indexes.isEmpty()) { // a surprisingly common case in TokensRegex
    // A locked index must refuse new elements even while it is still empty.
    if (locked) {
      return -1;
    }
    objects.add(o);
    indexes.put(o, 0);
    return 0;
  }
  Integer index = indexes.get(o);
  if (index != null) {
    return index;
  }
  if (locked) {
    return -1;
  }
  index = objects.size();
  objects.add(o);
  indexes.put(o, index);
  return index;
}
/**
 * Returns the integer index of an Object, optionally adding the Object
 * to the index first.
 *
 * Notes: {@code indexOf(x, true)} is the direct replacement for the
 * {@code number(x)} method of the old Numberer class. With
 * {@code add == true} the call is forwarded to {@code addToIndex(o)};
 * otherwise it is forwarded to {@code indexOf(o)}.
 *
 * @param o the Object whose index is desired.
 * @param add Whether it is okay to add new items to the index
 * @return The index of the Object argument. Returns -1 if the object is not in the index.
 */
@Override
@Deprecated
public int indexOf(E o, boolean add) {
  return add ? addToIndex(o) : indexOf(o);
}
// Single-permit semaphore that addToIndex acquires and releases around its
// re-check-and-insert step.
private final Semaphore semaphore = new Semaphore(1);
// TODO: delete this when breaking serialization because we can leach off of AbstractCollection
/**
* Adds every member of Collection to the Index. Does nothing for members already in the Index.
*
* @return true if some item was added to the index and false if no
* item was already in the index or if the index is locked
*/
@Override
public boolean addAll(Collection extends E> c) {
boolean changed = false;
for (E element: c) {
changed |= add(element);
//changed &= add(element);
}
return changed;
}
/**
 * Appends an object to the Index unless it is already present or the
 * Index is locked; in either of those cases the Index is left untouched.
 *
 * @param o the object to add
 * @return true if the item was added to the index and false if the
 *         item was already in the index or if the index is locked
 */
@Override
public boolean add(E o) {
  final boolean alreadyIndexed = indexes.get(o) != null;
  if (alreadyIndexed || locked) {
    return false;
  }
  // The new element goes on the end, so its index is the current size.
  final int position = objects.size();
  objects.add(o);
  indexes.put(o, position);
  return true;
}
/**
 * Tests whether an Object has an entry in the object-to-position map.
 *
 * @param o the object to be queried.
 * @return true iff the map holds a key equal to the queried object.
 */
@SuppressWarnings({"SuspiciousMethodCalls"})
@Override
public boolean contains(Object o) {
  final boolean present = indexes.containsKey(o);
  return present;
}
/**
 * Creates a new, empty Index backed by an {@link ArrayList} and a map
 * obtained from {@code Generics.newHashMap()}.
 */
public HashIndex() {
  // The superclass no-arg constructor is invoked implicitly.
  this.objects = new ArrayList<>();
  this.indexes = Generics.newHashMap();
}
/**
 * Creates a new, empty Index whose backing list and map are both
 * constructed with the given capacity argument.
 *
 * @param capacity Initial capacity of Index.
 */
public HashIndex(int capacity) {
  // The superclass no-arg constructor is invoked implicitly.
  this.objects = new ArrayList<>(capacity);
  this.indexes = Generics.newHashMap(capacity);
}
/**
* Create a new {@code HashIndex}, backed by the given collection types.
* @param objLookupFactory The constructor for the object lookup -- traditionally an {@link ArrayList}.
* @param indexLookupFactory The constructor for the index lookup -- traditionally a {@link HashMap}.
*/
public HashIndex(Supplier> objLookupFactory, Supplier