All downloads are free. Search and download functionality uses the official Maven repository.

org.terrier.terms.SkipTermPipeline Maven / Gradle / Ivy

The newest version!
/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is SkipTermPipeline.java.
 *
 * The Original Code is Copyright (C) 2008-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Rodrygo Santos  (original author)
 *   Craig Macdonald 
 *   
 */
package org.terrier.terms;

import gnu.trove.THashSet;
import org.terrier.utility.ApplicationSetup;

/** Class that identified tokens which should not be passed down the entire term pipeline, and instead
  * passed onto a specified stage instead. Tokens are autmatically lowercased if lowercase 
  * is set (as it is by default). If no tokens are specified, then no tokens are omitted from the term
  * pipeline.
  * 

Properties *

  • termpipelines.skip - list of tokens to skip
  • *
  • lowercase - whether tokens should be lowercased
  • *
* @author Rodrygo Santos and Craig Macdonald * @since 2.2 */ public class SkipTermPipeline implements TermPipeline { final TermPipeline next, last; final THashSet skipTerms = new THashSet(); /** Instantiate this object, using properties to define tokens. Skip tokens are * specified as a comma delimited list, using the termpipelines.skip * property. Terms are lowercased if lowercase is set (as it is by default). */ public SkipTermPipeline(TermPipeline _next, TermPipeline _last) { this.next = _next; this.last = _last; String tokens = ApplicationSetup.getProperty("termpipelines.skip", null); if (tokens == null || (tokens = tokens.trim()).length() == 0) { return; } if (Boolean.parseBoolean(ApplicationSetup.getProperty("lowercase", "true"))) tokens = tokens.toLowerCase(); for (String st : tokens.split("\\s*,\\s*")) { skipTerms.add(st); } } /** Instantiate this object. Terms in skipTokens will be passed to the last term pipeline * object instead of the next. */ public SkipTermPipeline(TermPipeline _next, TermPipeline _last, String[] _skipTokens) { this.next = _next; this.last = _last; for (String st : _skipTokens) { skipTerms.add(st); } } /** Processes this token. If is a specified token, then passes it to the last * stage in the pipgeline, instead of onto the next one. * @param term */ public void processTerm(String term) { // if term should be skiped if (skipTerms.contains(term)) { // jump to last termpipeline last.processTerm(term); } else { // proceed to next termpipeline next.processTerm(term); } } /** * Implements the default operation for all TermPipeline subclasses; * By default do nothing. * This method should be overrided by any TermPipeline that want to implements doc/query * oriented lifecycle. * @return return how the reset has gone */ public boolean reset() { return next!=null ? next.reset() : true; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy