All downloads are free. Search and download functionality uses the official Maven repository.

net.sf.okapi.steps.wordcount.WordCountStep Maven / Gradle / Ivy

/*===========================================================================
  Copyright (C) 2008-2011 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.wordcount;

import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.steps.wordcount.common.GMX;
import net.sf.okapi.steps.wordcount.common.Parameters;
import net.sf.okapi.steps.wordcount.common.TokenCountStep;

/**
 * Pipeline step that counts words. The counter counts a number of words in translatable
 * text units, and the results are placed in a MetricsAnnotation structure (with the GMX
 * TotalWordCount metric set) attached to the resource of the respective event
 * (TEXT_UNIT, END_DOCUMENT, END_BATCH, END_BATCH_ITEM, END_SUBDOCUMENT, END_GROUP).
 * <p>
 * Counting is delegated to {@link TokenCountStep}, except for locales that
 * {@link GMX#isLogographicScript(LocaleId)} reports as logographic; those are handed to
 * {@code WordCounter.countLogographicScript(...)} instead.
 *
 * @version 0.1 06.07.2009
 */
@UsingParameters(Parameters.class)
public class WordCountStep extends TokenCountStep {

	/** Identifier of the metric this step reports: the GMX total word count. */
	public static final String METRIC = GMX.TotalWordCount;

	/** Human-readable summary returned by {@link #getDescription()}. */
	private static final String DESCRIPTION =
		"Count the number of words in the text units of a set of documents or/and in its parts."
		+ " Expects: filter events. Sends back: filter events.";

	/**
	 * @return the display name of this step
	 */
	@Override
	public String getName() {
		return "Word Count";
	}

	/**
	 * @return a short description of what this step does and which events it handles
	 */
	@Override
	public String getDescription() {
		return DESCRIPTION;
	}

	/**
	 * @return the metric identifier, always {@link #METRIC}
	 */
	@Override
	public String getMetric() {
		return METRIC;
	}

	/**
	 * Lists the token names that make up a "word" for this step.
	 *
	 * @return a freshly allocated array holding WORD, HYPHENATED_WORD and NUMBER
	 */
	@Override
	protected String[] getTokenNames() {
		String[] wordTokens = {"WORD", "HYPHENATED_WORD", "NUMBER"};
		return wordTokens;
	}

	/**
	 * NOTE(review): how this flag is interpreted is decided by TokenCountStep, which is
	 * not visible here - confirm against the base class.
	 *
	 * @return always {@code false}
	 */
	@Override
	protected boolean countOnlyTranslatable() {
		return false;
	}

	/**
	 * Counts the words of a single segment.
	 *
	 * @param segment the segment to count
	 * @param locale the locale of the segment's text
	 * @return the count from the base class, or from the logographic-script counter when
	 *         the locale is reported as logographic
	 */
	@Override
	protected long count(Segment segment, LocaleId locale) {
		// Non-logographic locales go through the token-based counting of the base class.
		if (!GMX.isLogographicScript(locale)) {
			return super.count(segment, locale);
		}
		return WordCounter.countLogographicScript(segment, locale);
	}

	/**
	 * Counts the words of a whole text container.
	 *
	 * @param textContainer the container to count
	 * @param locale the locale of the container's text
	 * @return the count from the base class, or from the logographic-script counter when
	 *         the locale is reported as logographic
	 */
	@Override
	protected long count(TextContainer textContainer, LocaleId locale) {
		// Non-logographic locales go through the token-based counting of the base class.
		if (!GMX.isLogographicScript(locale)) {
			return super.count(textContainer, locale);
		}
		return WordCounter.countLogographicScript(textContainer, locale);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy