All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.terrier.utility.TermCodes Maven / Gradle / Ivy

The newest version!
/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is TermCodes.java.
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Vassilis Plachouras  (original author)
 */
package org.terrier.utility;
import gnu.trove.TObjectIntHashMap;
/**
 * Assigns integer codes to terms while a document collection is indexed.
 * Codes are handed out sequentially, starting from 0, in the order in which
 * previously unseen terms are passed to {@link #getCode(String)}.
 *
 * <p>Two properties are read through {@code ApplicationSetup}:</p>
 * <ul>
 * <li>{@code termcodes.initialcapacity} - the initial capacity of the
 * underlying hash map. The default value is 3000000.</li>
 * <li>{@code termcodes.garbagecollect} - enables or disables garbage
 * collection during the call of the method {@link #reset()}. The default
 * value is true.</li>
 * </ul>
 *
 * <p>This class performs no synchronisation of its own.</p>
 *
 * @author Vassilis Plachouras
 */
public class TermCodes {

	/** The initial capacity of the hashmap. */
	private static int hashMapCapacity;

	/**
	 * The property that enables or disables
	 * garbage collection during resetting.
	 */
	private static boolean garbageCollection;

	/*
	 * Static initialisation of the class properties from
	 * the properties file. It calls the method initialise().
	 */
	static {
		initialise();
	}

	/**
	 * The hashmap that stores the mapping from terms to codes.
	 * NB: because the GNU trove TObjectIntHashMap returns 0 for not found,
	 * the value stored is 1 above the true termcode for every term. E.g. the
	 * first term has true termcode 0, but the value 1 is stored in the map.
	 */
	private final TObjectIntHashMap<String> map =
		new TObjectIntHashMap<String>(hashMapCapacity);

	/**
	 * The highest stored value (true termcode + 1) handed out so far. The
	 * next not already encountered term is stored as {@code counter + 1},
	 * i.e. it receives the true termcode {@code counter}.
	 */
	private int counter = 0;

	/**
	 * Initialises the properties from the property file.
	 * The initial capacity of the hash map is set to the
	 * value of the property {@code termcodes.initialcapacity}.
	 * The default value is 3000000. The second property
	 * is related to the method reset() and enables or disables
	 * garbage collection when the reset method is called.
	 * The corresponding property is {@code termcodes.garbagecollect},
	 * and its default value is true.
	 */
	public static void initialise() {
		hashMapCapacity = Integer.parseInt(
			ApplicationSetup.getProperty("termcodes.initialcapacity", "3000000"));
		garbageCollection = Boolean.parseBoolean(
			ApplicationSetup.getProperty("termcodes.garbagecollect", "true"));
	}

	/**
	 * Returns the code for a given term. If the term has not been
	 * encountered before, it is added to the hash map and a new code is
	 * assigned to it; otherwise the already assigned code is returned.
	 *
	 * @param term String the term for which the code will be returned.
	 * @return int the code for the given term
	 */
	public final int getCode(final String term) {
		int stored = map.get(term);
		if (stored == 0) {
			// 0 means "not found": assign the next free code (stored 1-based)
			stored = ++counter;
			map.put(term, stored);
		}
		// convert the 1-based stored value back to the true termcode
		return stored - 1;
	}

	/**
	 * Resets the hashmap that contains the mapping
	 * from the terms to the term ids. If the property
	 * garbageCollection is true, then it performs garbage
	 * collection in order to free allocated memory. This method
	 * should be called after the creation of the lexicon.
	 * Nothing is done when no codes have been assigned or set.
	 */
	@edu.umd.cs.findbugs.annotations.SuppressWarnings(
		value="DM_GC",
		justification="Forcing GC is an essential part of releasing" +
		"memory for further indexing")
	public void reset() {
		// the map is also checked, so entries that were added without
		// advancing the counter are still cleared
		if (counter == 0 && map.isEmpty())
			return;
		map.clear();
		if (garbageCollection)
			System.gc();
		counter = 0;
	}

	/**
	 * For when you manually want to set the code for a given term, and you
	 * know that this term and termcode do NOT exist, then you can use
	 * this method. The internal counter is advanced past the given code
	 * when necessary, so that codes subsequently assigned by
	 * {@link #getCode(String)} do not collide with it.
	 *
	 * <p>NB: a termCode of -1 is stored as 0, which
	 * {@link #getCode(String)} treats as "not found".</p>
	 *
	 * @param term the term to set the code for
	 * @param termCode the (0-based) code to associate with the term
	 */
	public void setTermCode(final String term, final int termCode) {
		final int stored = termCode + 1;
		map.put(term, stored);
		if (stored > counter)
			counter = stored;
	}
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy