All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.predict4all.nlp.ngram.trie.AbstractNGramTrieNode Maven / Gradle / Ivy

/*
 * Copyright 2020 - Mathieu THEBAUD
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package org.predict4all.nlp.ngram.trie;

import gnu.trove.procedure.TObjectProcedure;
import org.predict4all.nlp.ngram.trie.map.TrieNodeMap;

import java.nio.channels.FileChannel;

/**
 * Represent a node in a trie structure to represent ngrams. Trie structure is used to save memory because information about ngram are very redundant.
*
* For example, the trie for sentences "this is sentence. this is what" will contains following nodes (showing 3 gram only): *

* - this
* --- is
* ----- sentence
* ----- what
*

* Trie node can be static or dynamic, where both has different application :
*

    *
  • Static : static trie node are node loaded on demand while browsing the trie structure. Their frequencies and backoff weight are pre-computed and used "as it" by the dictionary. Static node doesn't support * insertion/remove. They are useful to browse huge ngram trie with a limited memory use.
  • *
  • Dynamic : dynamic trie node are fully loaded (which mean than the whole trie is loaded into memory) and they support insertion/removal. Their frequencies and bow are computed but can be dynamically computed because the count * values are loaded. They are useful to train a ngram model (counting) or when the ngram trie is small (e.g. user ngram model).
  • *
* * @param node children type (typically this node type) * @author Mathieu THEBAUD */ public abstract class AbstractNGramTrieNode> { private static final int INTEGER_BYTE_SIZE = 4; private static final int DOUBLE_BYTE_SIZE = 8; // ATTRIBUTE // ======================================================================== /** * Static node byte size (3 integer, 2 double).
* Integer : word id, children size, children position.
* Double : frequency, backoff weight. */ public static final int STATIC_TRIE_NODE_SIZE_BYTE = 3 * INTEGER_BYTE_SIZE + 2 * DOUBLE_BYTE_SIZE; /** * Dynamic node byte size (4 integer) * Integer : word id, children size, children position, count */ public static final int DYNAMIC_TRIE_NODE_SIZE_BYTE = 4 * INTEGER_BYTE_SIZE; /** * Contains the children nodes position in file.
* Position in a {@link FileChannel} is a long type, but to save memory the value is stored as an int (trie file never contains more than {@link Integer#MAX_VALUE} byte) */ protected int childrenPosition = -1; /** * Represent the children node for this node.
* Each child is stored by its value (= word id) and represent the possible next value.
* To save memory, the map is created on demand, so even if this node has children, the map can be null if children are not loaded yet. */ protected TrieNodeMap children; /** * Computed frequency for this node */ protected double frequency; /** * Backoff weight for this node children frequencies */ protected double childrenBackoffWeight = 1.0; // ======================================================================== // SIMPLE API // ======================================================================== /** * @return this node computed frequency */ public double getFrequency() { return frequency; } /** * @return this node children backoff weight */ public double getChildrenBackoffWeight() { return childrenBackoffWeight; } /** * @return this node children (can be null if this node has no children, or if children are not loaded) */ public TrieNodeMap getChildren() { return this.children; } /** * @return the different children count (not the total children count) */ public abstract int getChildrenSize(); // ======================================================================== // COMPACT // ======================================================================== private static final TObjectProcedure> COMPACT_CHILD_PROCEDURE = node -> { node.compact(); return true; }; /** * compact the children of this node (if this node has children) */ public void compact() { if (children != null) { children.compact(); this.children.forEachValue(COMPACT_CHILD_PROCEDURE); } } // ======================================================================== }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy