All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.BreakCTDictionary Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/*
 *******************************************************************************
 * Copyright (C) 1996-2011, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.CharacterIterator;

import com.ibm.icu.impl.ICUBinary;

/*
 * This is a class used to load in the compact trie dictionary file
 * used for dictionary based break iteration. 
 */
class BreakCTDictionary {
    private CompactTrieHeader fData;

    static class CompactTrieHeader {
        int size; // Size of data in bytes

        int magic; // Magic number (including versions)

        int nodeCount; // Number of entries in offsets[]

        int root; // Node number of the root node

        int []offset; // Offsets to nodes from start of data

        CompactTrieHeader() {
            size = 0;
            magic = 0;
            nodeCount = 0;
            root = 0;
            offset = null;
        }
    }

    static final class CompactTrieNodeFlags {
        static final int kVerticalNode = 0x1000; // This is a vertical node

        static final int kParentEndsWord = 0x2000; // The node whose equal link
                                                    // points to this ends a
                                                    // word

        static final int kReservedFlag1 = 0x4000;

        static final int kReservedFlag2 = 0x8000;

        static final int kCountMask = 0x0FFF; // The count portion of
                                                // flagscount

        static final int kFlagMask = 0xF000; // The flags portion of
                                                // flagscount
    }

    // The two node types are distinguished by the kVerticalNode flag.
    static class CompactTrieHorizontalNode {
        char ch; // UChar

        int equal; // Equal link node index

        CompactTrieHorizontalNode(char newCh, int newEqual) {
            ch = newCh;
            equal = newEqual;
        }
    }

    static class CompactTrieVerticalNode {
        int equal; // Equal link node index

        char chars[]; // Code units

        CompactTrieVerticalNode() {
            equal = 0;
            chars = null;
        }
    }

    private CompactTrieNodes getCompactTrieNode(int node) {
        return nodes[node];
    }

    // private class to hold both node information
    static class CompactTrieNodes {
        short flagscount; // Count of sub-entries, plus flags

        CompactTrieHorizontalNode[] hnode;

        CompactTrieVerticalNode vnode;

        CompactTrieNodes() {
            flagscount = 0;
            hnode = null;
            vnode = null;
        }
    }

    private CompactTrieNodes[] nodes;

    // Constructor
    public BreakCTDictionary(InputStream is) throws IOException {
        ICUBinary.readHeader(is, DATA_FORMAT_ID, null);

        DataInputStream in = new DataInputStream(is);
        // Get header information
        fData = new CompactTrieHeader();
        fData.size = in.readInt();
        fData.magic = in.readInt();
        fData.nodeCount = in.readShort();
        fData.root = in.readShort();

        loadBreakCTDictionary(in);
    }

    // Loads the compact trie dictionary file into the CompactTrieNodes
    private void loadBreakCTDictionary(DataInputStream in) throws IOException {
        // skip over offset information
        for (int i = 0; i < fData.nodeCount; i++) {
            in.readInt();
        }

        // Create compact trie dictionary
        nodes = new CompactTrieNodes[fData.nodeCount];
        nodes[0] = new CompactTrieNodes();

        // Load in compact trie dictionary
        for (int j = 1; j < fData.nodeCount; j++) {
            nodes[j] = new CompactTrieNodes();
            nodes[j].flagscount = in.readShort();

            int count = nodes[j].flagscount & CompactTrieNodeFlags.kCountMask;

            if (count != 0) {
                boolean isVerticalNode = (nodes[j].flagscount & CompactTrieNodeFlags.kVerticalNode) != 0;

                // Vertical node
                if (isVerticalNode) {
                    nodes[j].vnode = new CompactTrieVerticalNode();
                    nodes[j].vnode.equal = in.readShort();

                    nodes[j].vnode.chars = new char[count];
                    for (int l = 0; l < count; l++) {
                        nodes[j].vnode.chars[l] = in.readChar();
                    }
                } else { // Horizontal node
                    nodes[j].hnode = new CompactTrieHorizontalNode[count];
                    for (int n = 0; n < count; n++) {
                        nodes[j].hnode[n] = new CompactTrieHorizontalNode(in
                                .readChar(), in.readShort());
                    }
                }
            }
        }
    }

    /**
     * Find dictionary words that match the text.
     * 
     * @param text A CharacterIterator representing the text. The iterator is
     *            left after the longest prefix match in the dictionary.
     * @param maxLength The maximum number of code units to match.
     * @param lengths An array that is filled with the lengths of words that matched.
     * @param count Filled with the number of elements output in lengths.
     * @param limit The size of the lengths array; this limits the number of words output.
     * @return The number of characters in text that were matched.
     */
    public int matches(CharacterIterator text, int maxLength, int lengths[],
            int count[], int limit) {
        // Current implementation works in UTF-16 space
        CompactTrieNodes node = getCompactTrieNode(fData.root);
        int mycount = 0;

        char uc = text.current();
        int i = 0;
        boolean exitFlag = false;

        while (node != null) {
            // Check if the node we just exited ends a word
            if (limit > 0
                    && (node.flagscount & CompactTrieNodeFlags.kParentEndsWord) != 0) {
                lengths[mycount++] = i;
                --limit;
            }
            // Check that we haven't exceeded the maximum number of input
            // characters.
            // We have to do that here rather than in the while condition so
            // that
            // we can check for ending of a word above.
            if (i >= maxLength) {
                break;
            }

            int nodeCount = (node.flagscount & CompactTrieNodeFlags.kCountMask);
            if (nodeCount == 0) {
                // Special terminal node; return now
                break;
            }
            if ((node.flagscount & CompactTrieNodeFlags.kVerticalNode) != 0) {
                // Vertical node; check all the characters in it
                CompactTrieVerticalNode vnode = node.vnode;
                for (int j = 0; j < nodeCount && i < maxLength; j++) {
                    if (uc != vnode.chars[j]) {
                        // We hit a non equal character return;
                        exitFlag = true;
                        break;
                    }
                    text.next();
                    uc = text.current();
                    i++;
                }
                if (exitFlag) {
                    break;
                }
                // To get here, we must have come through the whole list successfully;
                // go on to the next node. Note that a word cannot end in the middle 
                // of a vertical node.
                node = getCompactTrieNode(vnode.equal);
            } else {
                // Horizontal node; do binary search
                CompactTrieHorizontalNode[] hnode = node.hnode;
                int low = 0;
                int high = nodeCount - 1;
                int middle;
                node = null; // If we don't find a match, we'll fall out of the loop
                while (high >= low) {
                    middle = (high + low) >>> 1;
                    if (uc == hnode[middle].ch) {
                        // We hit a match; get the next node and next character
                        node = getCompactTrieNode(hnode[middle].equal);
                        text.next();
                        uc = text.current();
                        i++;
                        break;
                    } else if (uc < hnode[middle].ch) {
                        high = middle - 1;
                    } else {
                        low = middle + 1;
                    }
                }
            }
        }

        count[0] = mycount;
        return i;
    }

    // Use for reading the header portion of the file
    private static final byte DATA_FORMAT_ID[] = { (byte) 0x54, (byte) 0x72,
            (byte) 0x44, (byte) 0x63 };
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy