src.it.unimi.dsi.big.util.TernaryIntervalSearchTree Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of dsiutils Show documentation
The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Universita` degli Studi di Milano.
There is a newer version: 2.7.3
Show newest version
package it.unimi.dsi.big.util;

/*
 * DSI utilities
 *
 * Copyright (C) 2005-2020 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.LongInterval;
import it.unimi.dsi.util.LongIntervals;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Iterator;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;

/** Ternary interval search trees.
 *
 * <p>Ternary search trees are a data structure used to store words over an alphabet; they are
 * a useful alternative to tries when the alphabet is large.
 *
 * <p>Ternary interval search trees have the additional property of being able
 * to locate quickly intervals of words extending a given prefix (where “quickly” means
 * that no more successful character comparisons than the prefix length are performed). They do so
 * by storing at each node the number of words covered by that node.
 *
 * <p>This implementation exposes a number of interfaces: in particular, the set of words is
 * seen as a lexicographically ordered {@link it.unimi.dsi.fastutil.objects.ObjectList}.
 *
 * <p>This class is mutable, but for the time being it implements only {@link #add(CharSequence)}. Words cannot
 * be removed.
 *
 * @since 2.0
 */

public class TernaryIntervalSearchTree extends AbstractPrefixMap implements Serializable {
	private static final long serialVersionUID = 1L;

	/** A node of the tree: a compressed path of characters plus links to three subtrees.
	 *
	 * <p>Traversals compare the next input character with the <em>last</em> character of
	 * {@link #path}: smaller characters continue in {@link #left}, larger ones in {@link #right},
	 * and equal ones in {@link #middle}.
	 */
	private final static class Node implements Serializable {
		private static final long serialVersionUID = 1L;
		/** The subtree followed when the input character is smaller than the last character of {@link #path}. */
		public Node left;
		/** The subtree followed when the input character equals the last character of {@link #path}. */
		public Node middle;
		/** The subtree followed when the input character is larger than the last character of {@link #path}. */
		public Node right;
		/** The nonempty path compressed at this node. */
		public char[] path;
		/** Whether the sequence ending with the last character of {@link #path} is a word. */
		public boolean isWord;
		/** The number of words covered by this node (including the word possibly represented by this node). */
		public long numNodes;

		/** Creates a new node whose path is copied from a fragment of a character sequence.
		 *
		 * @param s a character sequence containing the path of the node.
		 * @param offset the index in {@code s} of the first character of the path.
		 * @param length the length of the path.
		 * @param isWord whether this node represents a word.
		 * @param numNodes the number of words covered by this node.
		 */
		public Node(final CharSequence s, final int offset, final int length, final boolean isWord, final long numNodes) {
			final char[] chars = new char[length];
			MutableString.getChars(s, offset, offset + length, chars, 0);
			this.path = chars;
			this.isWord = isWord;
			this.numNodes = numNodes;
		}

		/** Creates a new node whose path is copied from a fragment of a character array.
		 *
		 * @param a a character array containing the path of the node.
		 * @param offset the index in {@code a} of the first character of the path.
		 * @param length the length of the path.
		 * @param isWord whether this node represents a word.
		 * @param numNodes the number of words covered by this node.
		 */
		public Node(final char[] a, final int offset, final int length, final boolean isWord, final long numNodes) {
			final char[] chars = new char[length];
			System.arraycopy(a, offset, chars, 0, length);
			this.path = chars;
			this.isWord = isWord;
			this.numNodes = numNodes;
		}

		/** Drops the first {@code length} characters of the path of this node,
		 * replacing the path with a freshly allocated array holding the remaining suffix.
		 *
		 * @param length the length of the prefix to be removed.
		 */
		public void removePathPrefix(final int length) {
			final int remaining = path.length - length;
			final char[] suffix = new char[remaining];
			System.arraycopy(path, length, suffix, 0, remaining);
			path = suffix;
		}
	}

	/** The root of the tree. */
	private Node root;

	/** The number of words stored in the tree. */
	private int size;

	/** Creates a new empty ternary search tree whose default return value is &minus;1. */
	public TernaryIntervalSearchTree() {
		super();
		this.defRetValue = -1;
	}

	/** Creates a new ternary search tree and populates it with a given collection of character sequences.
	 *
	 * <p>Each element is inserted with {@link #add(CharSequence)}, in the iteration order
	 * of the collection; duplicates are stored once. The default return value is set to &minus;1.
	 *
	 * @param c a collection of character sequences.
	 */
	public TernaryIntervalSearchTree(final Collection<? extends CharSequence> c) {
		// The parameter is typed so that each element can be passed to add(CharSequence);
		// with a raw collection the elements would be plain Objects.
		for (final CharSequence s : c) add(s);
		defRetValue = -1;
	}


	/** Returns the interval of positions of the words extending a given prefix.
	 *
	 * <p>The tree is descended keeping track of the number of words that the traversal
	 * leaves on its left; once the prefix is exhausted, the word count stored in the
	 * current node gives the width of the interval.
	 *
	 * @param s a prefix.
	 * @return the interval of positions of the words that extend {@code s}, or
	 * {@link LongIntervals#EMPTY_INTERVAL} if the descent finds a mismatch or runs out of nodes.
	 */
	@Override
	protected LongInterval getInterval(final CharSequence s) {
		final int length = s.length();
		int pos = 0; // Number of characters of s consumed so far.
		long skipped = 0; // Number of words left behind, on the left, by the descent.
		Node node = root;

		while (node != null) {
			final char[] path = node.path;
			final int last = path.length - 1;

			// Match the compressed path, last character excluded.
			int matched = 0;
			while (matched < last && pos + matched < length && s.charAt(pos + matched) == path[matched]) matched++;

			// The prefix ended inside the path: every word covered by this node extends it.
			if (pos + matched == length) return LongInterval.valueOf(skipped, skipped + node.numNodes - 1);
			// Mismatch inside the path.
			if (matched < last) return LongIntervals.EMPTY_INTERVAL;
			pos += matched;

			// The last character of the path decides which subtree to follow.
			final char c = s.charAt(pos);
			final char pivot = path[matched];

			if (c == pivot) {
				pos++;
				if (node.left != null) skipped += node.left.numNodes;
				if (pos == length) {
					// The prefix ends exactly here: count this node's word (if any) and the middle subtree.
					final long covered = (node.isWord ? 1 : 0) + (node.middle == null ? 0 : node.middle.numNodes);
					return LongInterval.valueOf(skipped, skipped + covered - 1);
				}
				if (node.isWord) skipped++;
				node = node.middle;
			}
			else if (c < pivot) node = node.left;
			else {
				// Going right skips the left subtree, the middle subtree and this node's word.
				if (node.left != null) skipped += node.left.numNodes;
				if (node.middle != null) skipped += node.middle.numNodes;
				if (node.isWord) skipped++;
				node = node.right;
			}
		}

		return LongIntervals.EMPTY_INTERVAL;
	}


	/** Returns an approximated interval of positions for a given prefix.
	 *
	 * <p>The descent is the same as in {@link #getInterval(CharSequence)}, but the result is looser:
	 * <ul>
	 * <li>when the prefix is matched, the returned interval is the matching one, extended by one
	 * position to the left in the cases handled below;
	 * <li>when a mismatch is found inside a compressed path, the result is a single position: the one
	 * just before the words covered by the current node if the mismatching character is smaller,
	 * the last position covered by the current node otherwise;
	 * <li>when the descent runs out of nodes, the result is the single position just before the
	 * point where the descent stopped.
	 * </ul>
	 * Whenever the position “just before” does not exist (no word is on the left),
	 * {@link LongIntervals#EMPTY_INTERVAL} is returned by the mismatch cases.
	 *
	 * @param s a prefix.
	 * @return an interval of positions approximating the location of {@code s}, as described above.
	 */
	public LongInterval getApproximatedInterval(final CharSequence s) {
		final int l = s.length();

		Node e = root;
		int i;
		int offset = 0;
		// This counter accumulates long word counts: it must be a long, as a compound
		// assignment to an int would silently truncate the sum on very large trees.
		long wordsAtLeft = 0;
		char c;
		char[] path;

		while(e != null) {
			path = e.path;
			// Match the compressed path, last character excluded.
			for(i = 0; i < path.length - 1 && offset + i < l && s.charAt(offset + i) == path[i]; i++);
			if (offset + i == l) {
				// Our sequence is a proper prefix of path.
				return wordsAtLeft > 0 ? LongInterval.valueOf(wordsAtLeft - 1, wordsAtLeft + e.numNodes - 1) : LongInterval.valueOf(wordsAtLeft, wordsAtLeft + e.numNodes - 1);
			}
			if (i < path.length - 1) {
				// We stopped the loop prematurely.

				if (s.charAt(offset + i) < path[i]) return wordsAtLeft > 0 ? LongInterval.valueOf(wordsAtLeft -1) : LongIntervals.EMPTY_INTERVAL;
				else return LongInterval.valueOf(wordsAtLeft + e.numNodes  - 1);
			}

			offset += i;

			// The last character of the path decides which subtree to follow.
			c = s.charAt(offset);
			if (c < path[i]) e = e.left;
			else if (c > path[i]) {
				// Going right skips the left subtree, the middle subtree and this node's word.
				if (e.left != null) wordsAtLeft += e.left.numNodes;
				if (e.middle != null) wordsAtLeft += e.middle.numNodes;
				if (e.isWord) wordsAtLeft++;
				e = e.right;
			}
			else {
				offset++;
				if (e.left != null) wordsAtLeft += e.left.numNodes;
				// NOTE(review): when this node is not a word and wordsAtLeft is zero the left
				// extreme below is -1; unlike the other returns, this one is not guarded. Confirm
				// that callers and LongInterval.valueOf() are fine with that.
				if (offset == l) return LongInterval.valueOf(wordsAtLeft - (e.isWord ? 0 : 1), wordsAtLeft + (e.isWord ? 1 : 0) + (e.middle == null ? 0 : e.middle.numNodes) - 1);
				if (e.isWord) wordsAtLeft++;
				e = e.middle;
			}
		}

		return wordsAtLeft > 0 ? LongInterval.valueOf(wordsAtLeft - 1) : LongIntervals.EMPTY_INTERVAL;
	}

	/** Writes into a mutable string the word of given position.
	 *
	 * <p>At each node the word counts of the subtrees tell whether the requested word lies
	 * in the left subtree, is the word of the node itself, lies in the middle subtree or
	 * lies in the right subtree. The characters of the traversed paths are appended to
	 * {@code s} along the way (the last character of a path is appended only when the
	 * descent stops at the node or continues in the middle).
	 *
	 * <p>No range check is performed here: presumably callers pass only valid positions.
	 *
	 * @param index the position of a word.
	 * @param s a mutable string to which the characters of the word are appended.
	 * @return {@code s}, compacted.
	 */
	@Override
	protected MutableString getTerm(long index, final MutableString s) {
		long remaining = index;
		Node node = root;

		while (true) {
			final char[] path = node.path;

			final Node left = node.left;
			if (left != null) {
				if (remaining < left.numNodes) {
					// The word is on the left: the last character of the path is not part of it.
					s.append(path, 0, path.length - 1);
					node = left;
					continue;
				}
				remaining -= left.numNodes;
			}

			if (node.isWord) {
				if (remaining == 0) return s.append(path).compact();
				remaining--;
			}

			final Node middle = node.middle;
			if (middle != null) {
				if (remaining < middle.numNodes) {
					// The word continues through the whole path of this node.
					s.append(path);
					node = middle;
					continue;
				}
				remaining -= middle.numNodes;
			}

			// The word is on the right: again, the last character of the path is dropped.
			s.append(path, 0, path.length - 1);
			node = node.right;
		}
	}

	/** Returns the position of a word.
	 *
	 * <p>The position is the number of words that the descent towards {@code s} leaves
	 * on its left.
	 *
	 * @param s a character sequence.
	 * @return the position of {@code s}, or &minus;1 if {@code s} is not a word of this tree.
	 */
	protected long getIndex(final CharSequence s) {
		final int l = s.length();

		Node e = root;
		int i;
		int offset = 0;
		// This counter accumulates long word counts and is returned as a long: keeping it
		// in an int would silently truncate the sum on very large trees.
		long wordsAtLeft = 0;
		char c;
		char[] path;

		while(e != null) {
			path = e.path;
			// The path, last character excluded, must be matched entirely.
			for(i = 0; i < path.length - 1; i++)
				if (offset + i == l || s.charAt(offset + i) != path[i]) return -1;

			offset += i;
			// The sequence ends before the last character of the path: it is not a word.
			if (offset == l) return -1;

			// The last character of the path decides which subtree to follow.
			c = s.charAt(offset);
			if (c < path[i]) e = e.left;
			else if (c > path[i]) {
				// Going right skips the left subtree, the middle subtree and this node's word.
				if (e.left != null) wordsAtLeft += e.left.numNodes;
				if (e.middle != null) wordsAtLeft += e.middle.numNodes;
				if (e.isWord) wordsAtLeft++;
				e = e.right;
			}
			else {
				offset++;
				if (e.left != null) wordsAtLeft += e.left.numNodes;
				if (offset == l) return e.isWord ? wordsAtLeft : -1;
				if (e.isWord) wordsAtLeft++;
				e = e.middle;
			}
		}

		return -1;
	}

	/** Checks whether a character sequence is a word of this tree.
	 *
	 * @param o a {@link CharSequence} (the argument is cast without checks).
	 * @return true if {@link #getIndex(CharSequence)} finds {@code o}.
	 */
	@Override
	public boolean containsKey(Object o) {
		final long position = getIndex((CharSequence)o);
		return position != -1;
	}

	/** Returns the position of a word, or the default return value if the word is not in this tree.
	 *
	 * @param o a {@link CharSequence} (the argument is cast without checks).
	 * @return the position returned by {@link #getIndex(CharSequence)}, or the default
	 * return value if that position is &minus;1.
	 */
	@Override
	public long getLong(final Object o) {
		final long position = getIndex((CharSequence)o);
		if (position == -1) return defRetValue;
		return position;
	}
	/** True if the last {@link #add(CharSequence)} modified the tree. */
	private boolean modified;

	/** Adds a word to this tree.
	 *
	 * <p>Empty sequences are rejected: every node must hold a nonempty path, and inserting
	 * an empty sequence into an empty tree would create a root with an empty path, which
	 * makes subsequent lookups and insertions fail with index errors.
	 *
	 * @param s a nonempty character sequence.
	 * @return true if the tree was modified, that is, if {@code s} was not already a word of this tree.
	 * @throws IllegalArgumentException if {@code s} is empty.
	 */
	public boolean add(final CharSequence s) {
		final int length = s.length();
		if (length == 0) throw new IllegalArgumentException("Empty character sequences cannot be added to a ternary interval search tree");
		// The flag is set by addRec() whenever a word is actually added.
		modified = false;
		root = addRec(s, 0, length, root);
		return modified;
	}

	/** Inserts the given character sequence, starting at the given position, in the given subtree.
	 *
	 * <p>As a side effect, this method sets {@link #modified} and increments {@link #size}
	 * whenever a new word is actually added, and keeps the word counts of the nodes
	 * it traverses up to date.
	 *
	 * @param s the character sequence containing the characters to be inserted.
	 * @param offset the first character to be inserted.
	 * @param length the number of characters to be inserted.
	 * @param e the subtree in which the characters should be inserted, or {@code null} if
	 * a new node should be created.
	 * @return the new node at the top of the subtree.
	 */

	private Node addRec(final CharSequence s, final int offset, final int length, final Node e) {

		if (e == null) {
			// We create a new node containing all the characters and return it.
			modified = true;
			size++;
			return new Node(s, offset, length, true, 1);
		}

		/* We start scanning the path contained in the current node, up to
		 * the last character excluded. If we find a mismatch, or if we exhaust our
		 * characters, we must fork this node. */

		char c;
		int i;
		Node n = null;
		final char[] path = e.path;

		for (i = 0; i < path.length - 1; i++) {
			c = s.charAt(offset + i);

			if (c < path[i]) {
				/* We fork on the left, keeping just the first i + 1 characters (this is necessary
				 * as at least one character must be present in every node). The new
				 * node will cover one word more than e.
				 */
				n = new Node(path, 0, i + 1, false, e.numNodes + 1);

				// The old node keeps the rest of the path and hangs from the middle of the new one.
				n.middle = e;
				e.removePathPrefix(i + 1);

				// Passing null forces the creation of a new node holding the remaining characters.
				n.left = addRec(s, offset + i, length - i, null);
				break;
			}
			else if (c > path[i]) {
				// As before, but on the right.
				n = new Node(path, 0, i + 1, false, e.numNodes + 1);

				n.middle = e;
				e.removePathPrefix(i + 1);

				n.right = addRec(s, offset + i, length - i, null);
				break;
			}
			else {
				if (i == length - 1) {
					/* We exhausted the character sequence. We fork in the middle,
					 * keeping length characters and marking the new node as
					 * containing one word. Again, the new node will cover one word
					 * more than e. */
					n = new Node(s, offset, length, true, e.numNodes + 1);
					n.middle = e;
					e.removePathPrefix(length);
					size++;
					modified = true;
					break;
				}
			}
		}

		// If the loop was interrupted we forked: the new node replaces e at the top of the subtree.
		if (i < path.length - 1) return n;

		/* We are positioned on the last character of the path. In this case our
		 * behaviour is different, as if we must fork we must not perform any
		 * splitting. Moreover, if we exhaust the characters we either found
		 * the new sequence in the tree, or we just have to mark the node. */

		c = s.charAt(offset + i);

		if (c < path[i]) {
			/* We fork on the left. The number of words under this node will
			 * increase only if the structure is modified. */
			e.left = addRec(s, offset + i, length - i, e.left);
			if (modified) e.numNodes++;
		}
		else if (c > path[i]) {
			// As before, but on the right.
			e.right = addRec(s, offset + i, length - i, e.right);
			if (modified) e.numNodes++;
		}
		else {
			if (i == length - 1) {
				// This is the node: the tree is modified only if the node was not already a word.
				if (modified = !e.isWord) {
					e.numNodes++;
					size++;
				}
				e.isWord = true;
			}
			else {
				// We add a node in the middle, completing the sequence.
				e.middle = addRec(s, offset + i + 1, length - i - 1, e.middle);
				if (modified) e.numNodes++;
			}
		}

		return e;
	}

	/** Returns the number of words in this tree.
	 *
	 * @return the number of words in this tree.
	 * @deprecated This method is marked as deprecated in this class; {@link #size64()} returns the same count as a long.
	 */
	@Override
	@Deprecated
	public int size() {
		return this.size;
	}

	/** Returns the number of words in this tree as a long.
	 *
	 * @return the number of words in this tree.
	 */
	@Override
	public long size64() {
		return this.size;
	}

	/** Builds a tree from the newline-separated terms read from standard input and serialises it.
	 *
	 * <p>The command line accepts an optional I/O buffer size, an optional encoding
	 * for the terms and the (required) name of the file in which the tree will be stored.
	 *
	 * @param arg the command-line arguments.
	 */
	public static void main(final String[] arg) throws IOException, JSAPException, NoSuchMethodException {

		final Parameter[] parameters = {
			new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b',  "buffer-size", "The size of the I/O buffer used to read terms."),
			new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding."),
			new UnflaggedOption("tree", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised tree.")
		};
		final SimpleJSAP parser = new SimpleJSAP(
			TernaryIntervalSearchTree.class.getName(),
			"Builds a ternary interval search tree reading from standard input a newline-separated list of terms.",
			parameters);

		final JSAPResult options = parser.parse(arg);
		// Nothing to do if the parser printed a message (e.g., help or errors).
		if (parser.messagePrinted()) return;

		final TernaryIntervalSearchTree tree = new TernaryIntervalSearchTree();

		final MutableString line = new MutableString();
		final ProgressLogger progress = new ProgressLogger();
		progress.itemsName = "terms";
		// The reader wraps standard input and is deliberately left open.
		@SuppressWarnings("resource")
		final FastBufferedReader input = new FastBufferedReader(new InputStreamReader(System.in, (Charset)options.getObject("encoding")), options.getInt("bufferSize"));

		progress.start("Reading terms...");

		for (;;) {
			if (input.readLine(line) == null) break;
			progress.update();
			tree.add(line);
		}

		progress.done();

		BinIO.storeObject(tree, options.getString("tree"));
	}

}