All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aowagie.text.pdf.hyphenation.TernaryTree Maven / Gradle / Ivy

/*
 * Copyright 1999-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.aowagie.text.pdf.hyphenation;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Stack;

/**
 * 

Ternary Search Tree.

* *

A ternary search tree is a hybrid between a binary tree and * a digital search tree (trie). Keys are limited to strings. * A data value of type char is stored in each leaf node. * It can be used as an index (or pointer) to the data. * Branches that only contain one key are compressed to one node * by storing a pointer to the trailer substring of the key. * This class is intended to serve as base class or helper class * to implement Dictionary collections or the like. Ternary trees * have some nice properties as the following: the tree can be * traversed in sorted order, partial matches (wildcard) can be * implemented, retrieval of all keys within a given distance * from the target, etc. The storage requirements are higher than * a binary tree but a lot less than a trie. Performance is * comparable with a hash table, sometimes it outperforms a hash * function (most of the time can determine a miss faster than a hash).

* *

The main purpose of this java port is to serve as a base for * implementing TeX's hyphenation algorithm (see The TeXBook, * appendix H). Each language requires from 5000 to 15000 hyphenation * patterns which will be keys in this tree. The strings patterns * are usually small (from 2 to 5 characters), but each char in the * tree is stored in a node. Thus memory usage is the main concern. * We will sacrifice 'elegance' to keep memory requirements to the * minimum. Using java's char type as pointer (yes, I know pointer * it is a forbidden word in java) we can keep the size of the node * to be just 8 bytes (3 pointers and the data char). This gives * room for about 65000 nodes. In my tests the English patterns * took 7694 nodes and the German patterns 10055 nodes, * so I think we are safe.

* *

All said, this is a map with strings as keys and char as value. * Pretty limited!. It can be extended to a general map by * using the string representation of an object and using the * char value as an index to an array that contains the object * values.

* * @author [email protected] */ class TernaryTree implements Cloneable, Serializable { /** * We use 4 arrays to represent a node. I guess I should have created * a proper node class, but somehow Knuth's pascal code made me forget * we now have a portable language with virtual memory management and * automatic garbage collection! And now is kind of late, furthermore, * if it ain't broken, don't fix it. */ private static final long serialVersionUID = 5313366505322983510L; /** * Pointer to low branch and to rest of the key when it is * stored directly in this node, we don't have unions in java! */ protected char[] lo; /** * Pointer to high branch. */ protected char[] hi; /** * Pointer to equal branch and to data when this node is a string terminator. */ protected char[] eq; /** *

The character stored in this node: splitchar. * Two special values are reserved:

*
  • 0x0000 as string terminator
  • *
  • 0xFFFF to indicate that the branch starting at * this node is compressed
*

This shouldn't be a problem if we give the usual semantics to * strings since 0xFFFF is guaranteed not to be an Unicode character.

*/ protected char[] sc; /** * This vector holds the trailing of the keys when the branch is compressed. */ protected CharVector kv; protected char root; private char freenode; private int length; // number of items in tree private static final int BLOCK_SIZE = 2048; // allocation size for arrays TernaryTree() { init(); } private void init() { this.root = 0; this.freenode = 1; this.length = 0; this.lo = new char[BLOCK_SIZE]; this.hi = new char[BLOCK_SIZE]; this.eq = new char[BLOCK_SIZE]; this.sc = new char[BLOCK_SIZE]; this.kv = new CharVector(); } /** * Branches are initially compressed, needing * one node per key plus the size of the string * key. They are decompressed as needed when * another key with same prefix * is inserted. This saves a lot of space, * specially for long keys. */ void insert(final String key, final char val) { // make sure we have enough room in the arrays int len = key.length() + 1; // maximum number of nodes that may be generated if (this.freenode + len > this.eq.length) { redimNodeArrays(this.eq.length + BLOCK_SIZE); } final char strkey[] = new char[len--]; key.getChars(0, len, strkey, 0); strkey[len] = 0; this.root = insert(this.root, strkey, 0, val); } void insert(final char[] key, final int start, final char val) { final int len = strlen(key) + 1; if (this.freenode + len > this.eq.length) { redimNodeArrays(this.eq.length + BLOCK_SIZE); } this.root = insert(this.root, key, start, val); } /** * The actual insertion function, recursive version. */ private char insert(char p, final char[] key, final int start, final char val) { final int len = strlen(key, start); if (p == 0) { // this means there is no branch, this node will start a new branch. // Instead of doing that, we store the key somewhere else and create // only one node with a pointer to the key p = this.freenode++; this.eq[p] = val; // holds data this.length++; this.hi[p] = 0; if (len > 0) { this.sc[p] = 0xFFFF; // indicates branch is compressed this.lo[p] = (char)this.kv.alloc(len + 1); // use 'lo' to hold pointer to key strcpy(this.kv.getArray(), this.lo[p], key, start); } else { this.sc[p] = 0; this.lo[p] = 0; } return p; } if (this.sc[p] == 0xFFFF) { // branch is compressed: need to decompress // this will generate garbage in the external key array // but we can do some garbage collection later final char pp = this.freenode++; this.lo[pp] = this.lo[p]; // previous pointer to key this.eq[pp] = this.eq[p]; // previous pointer to data this.lo[p] = 0; if (len > 0) { this.sc[p] = this.kv.get(this.lo[pp]); this.eq[p] = pp; this.lo[pp]++; if (this.kv.get(this.lo[pp]) == 0) { // key completely decompressed leaving garbage in key array this.lo[pp] = 0; this.sc[pp] = 0; this.hi[pp] = 0; } else { // we only got first char of key, rest is still there this.sc[pp] = 0xFFFF; } } else { // In this case we can save a node by swapping the new node // with the compressed node this.sc[pp] = 0xFFFF; this.hi[p] = pp; this.sc[p] = 0; this.eq[p] = val; this.length++; return p; } } final char s = key[start]; if (s < this.sc[p]) { this.lo[p] = insert(this.lo[p], key, start, val); } else if (s == this.sc[p]) { if (s != 0) { this.eq[p] = insert(this.eq[p], key, start + 1, val); } else { // key already in tree, overwrite data this.eq[p] = val; } } else { this.hi[p] = insert(this.hi[p], key, start, val); } return p; } /** * Compares 2 null terminated char arrays */ private static int strcmp(final char[] a, int startA, final char[] b, int startB) { for (; a[startA] == b[startB]; startA++, startB++) { if (a[startA] == 0) { return 0; } } return a[startA] - b[startB]; } private static void strcpy(final char[] dst, int di, final char[] src, int si) { while (src[si] != 0) { dst[di++] = src[si++]; } dst[di] = 0; } private static int strlen(final char[] a, final int start) { int len = 0; for (int i = start; i < a.length && a[i] != 0; i++) { len++; } return len; } private static int strlen(final char[] a) { return strlen(a, 0); } int find(final String key) { final int len = key.length(); final char strkey[] = new char[len + 1]; key.getChars(0, len, strkey, 0); strkey[len] = 0; return find(strkey, 0); } int find(final char[] key, final int start) { int d; char p = this.root; int i = start; char c; while (p != 0) { if (this.sc[p] == 0xFFFF) { if (strcmp(key, i, this.kv.getArray(), this.lo[p]) == 0) { return this.eq[p]; } else { return -1; } } c = key[i]; d = c - this.sc[p]; if (d == 0) { if (c == 0) { return this.eq[p]; } i++; p = this.eq[p]; } else if (d < 0) { p = this.lo[p]; } else { p = this.hi[p]; } } return -1; } // redimension the arrays private void redimNodeArrays(final int newsize) { final int len = newsize < this.lo.length ? newsize : this.lo.length; char[] na = new char[newsize]; System.arraycopy(this.lo, 0, na, 0, len); this.lo = na; na = new char[newsize]; System.arraycopy(this.hi, 0, na, 0, len); this.hi = na; na = new char[newsize]; System.arraycopy(this.eq, 0, na, 0, len); this.eq = na; na = new char[newsize]; System.arraycopy(this.sc, 0, na, 0, len); this.sc = na; } @Override public Object clone() { final TernaryTree t = new TernaryTree(); t.lo = this.lo.clone(); t.hi = this.hi.clone(); t.eq = this.eq.clone(); t.sc = this.sc.clone(); t.kv = (CharVector)this.kv.clone(); t.root = this.root; t.freenode = this.freenode; t.length = this.length; return t; } /** * Recursively insert the median first and then the median of the * lower and upper halves, and so on in order to get a balanced * tree. The array of keys is assumed to be sorted in ascending * order. */ private void insertBalanced(final String[] k, final char[] v, final int offset, final int n) { int m; if (n < 1) { return; } m = n >> 1; insert(k[m + offset], v[m + offset]); insertBalanced(k, v, offset, m); insertBalanced(k, v, offset + m + 1, n - m - 1); } /** * Balance the tree for best search performance */ private void balance() { // System.out.print("Before root splitchar = "); System.out.println(sc[root]); int i = 0; final int n = this.length; final String[] k = new String[n]; final char[] v = new char[n]; final Iterator iter = new Iterator(); while (iter.hasMoreElements()) { v[i] = iter.getValue(); k[i++] = (String)iter.nextElement(); } init(); insertBalanced(k, v, 0, n); // With uniform letter distribution sc[root] should be around 'm' // System.out.print("After root splitchar = "); System.out.println(sc[root]); } /** * Each node stores a character (splitchar) which is part of * some key(s). In a compressed branch (one that only contain * a single string key) the trailer of the key which is not * already in nodes is stored externally in the kv array. * As items are inserted, key substrings decrease. * Some substrings may completely disappear when the whole * branch is totally decompressed. * The tree is traversed to find the key substrings actually * used. In addition, duplicate substrings are removed using * a map (implemented with a TernaryTree!). * */ void trimToSize() { // first balance the tree for best performance balance(); // redimension the node arrays redimNodeArrays(this.freenode); // ok, compact kv array final CharVector kx = new CharVector(); kx.alloc(1); final TernaryTree map = new TernaryTree(); compact(kx, map, this.root); this.kv = kx; this.kv.trimToSize(); } private void compact(final CharVector kx, final TernaryTree map, final char p) { int k; if (p == 0) { return; } if (this.sc[p] == 0xFFFF) { k = map.find(this.kv.getArray(), this.lo[p]); if (k < 0) { k = kx.alloc(strlen(this.kv.getArray(), this.lo[p]) + 1); strcpy(kx.getArray(), k, this.kv.getArray(), this.lo[p]); map.insert(kx.getArray(), k, (char)k); } this.lo[p] = (char)k; } else { compact(kx, map, this.lo[p]); if (this.sc[p] != 0) { compact(kx, map, this.eq[p]); } compact(kx, map, this.hi[p]); } } private class Iterator implements Enumeration { /** * current node index */ private int cur; /** * current key */ private String curkey; private class Item implements Cloneable { char parent; char child; public Item() { this.parent = 0; this.child = 0; } public Item(final char p, final char c) { this.parent = p; this.child = c; } @Override public Object clone() { return new Item(this.parent, this.child); } } /** * Node stack */ private final Stack ns; /** * key stack implemented with a StringBuffer */ private final StringBuffer ks; public Iterator() { this.cur = -1; this.ns = new Stack(); this.ks = new StringBuffer(); rewind(); } private void rewind() { this.ns.removeAllElements(); this.ks.setLength(0); this.cur = TernaryTree.this.root; run(); } @Override public Object nextElement() { final String res = this.curkey; this.cur = up(); run(); return res; } public char getValue() { if (this.cur >= 0) { return TernaryTree.this.eq[this.cur]; } return 0; } @Override public boolean hasMoreElements() { return this.cur != -1; } /** * traverse upwards */ private int up() { Item i = new Item(); int res = 0; if (this.ns.empty()) { return -1; } if (this.cur != 0 && TernaryTree.this.sc[this.cur] == 0) { return TernaryTree.this.lo[this.cur]; } boolean climb = true; while (climb) { i = (Item)this.ns.pop(); i.child++; switch (i.child) { case 1: if (TernaryTree.this.sc[i.parent] != 0) { res = TernaryTree.this.eq[i.parent]; this.ns.push(i.clone()); this.ks.append(TernaryTree.this.sc[i.parent]); } else { i.child++; this.ns.push(i.clone()); res = TernaryTree.this.hi[i.parent]; } climb = false; break; case 2: res = TernaryTree.this.hi[i.parent]; this.ns.push(i.clone()); if (this.ks.length() > 0) { this.ks.setLength(this.ks.length() - 1); // pop } climb = false; break; default: if (this.ns.empty()) { return -1; } climb = true; break; } } return res; } /** * traverse the tree to find next key */ private int run() { if (this.cur == -1) { return -1; } boolean leaf = false; while (true) { // first go down on low branch until leaf or compressed branch while (this.cur != 0) { if (TernaryTree.this.sc[this.cur] == 0xFFFF) { leaf = true; break; } this.ns.push(new Item((char)this.cur, '\u0000')); if (TernaryTree.this.sc[this.cur] == 0) { leaf = true; break; } this.cur = TernaryTree.this.lo[this.cur]; } if (leaf) { break; } // nothing found, go up one node and try again this.cur = up(); if (this.cur == -1) { return -1; } } // The current node should be a data node and // the key should be in the key stack (at least partially) final StringBuffer buf = new StringBuffer(this.ks.toString()); if (TernaryTree.this.sc[this.cur] == 0xFFFF) { int p = TernaryTree.this.lo[this.cur]; while (TernaryTree.this.kv.get(p) != 0) { buf.append(TernaryTree.this.kv.get(p++)); } } this.curkey = buf.toString(); return 0; } } public void printStats() { System.out.println("Number of keys = " + Integer.toString(this.length)); System.out.println("Node count = " + Integer.toString(this.freenode)); // System.out.println("Array length = " + Integer.toString(eq.length)); System.out.println("Key Array length = " + Integer.toString(this.kv.length())); /* * for(int i=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy