// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// org.jpmml.evaluator.TextUtil Maven / Gradle / Ivy
//
// There is a newer version: 1.7.2
// Show newest version
/*
 * Copyright (c) 2017 Villu Ruusmann
 *
 * This file is part of JPMML-Evaluator
 *
 * JPMML-Evaluator is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * JPMML-Evaluator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with JPMML-Evaluator.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.jpmml.evaluator;

import java.util.List;
import java.util.Objects;
import java.util.concurrent.Callable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.cache.Cache;
import com.google.common.collect.Table;
import org.dmg.pmml.InlineTable;
import org.dmg.pmml.PMMLObject;
import org.dmg.pmml.Row;
import org.dmg.pmml.TextIndex;
import org.dmg.pmml.TextIndexNormalization;

public class TextUtil {

	// This class only exposes static members; the private constructor prevents instantiation from outside.
	private TextUtil(){
	}

	/**
	 * Applies every text index normalization of a text index to a string, in list order.
	 *
	 * @param textIndex The text index whose normalizations are applied.
	 * @param string The string to normalize.
	 *
	 * @return The string after it has been passed through each normalization in turn.
	 * The input string itself, if the text index reports no normalizations.
	 */
	static
	public String normalize(TextIndex textIndex, String string){

		if(!textIndex.hasTextIndexNormalizations()){
			return string;
		}

		// The element type must be declared: iterating a raw List as TextIndexNormalization does not compile
		List<TextIndexNormalization> textIndexNormalizations = textIndex.getTextIndexNormalizations();

		// Each normalization consumes the output of the previous one
		for(TextIndexNormalization textIndexNormalization : textIndexNormalizations){
			string = TextUtil.normalize(textIndex, textIndexNormalization, string);
		}

		return string;
	}

	/**
	 * Applies a single text index normalization to a string.
	 *
	 * <p>
	 * The tokenize, word separator, case sensitivity and maximum Levenshtein distance settings are read
	 * from the normalization first; when the normalization returns <code>null</code> for a setting,
	 * the value of the enclosing text index is used instead.
	 * </p>
	 *
	 * @param textIndex The enclosing text index, which supplies fallback settings.
	 * @param textIndexNormalization The normalization to apply.
	 * @param string The string to normalize.
	 *
	 * @return The normalized string. The input string itself, if the normalization has no inline table.
	 *
	 * @throws InvalidAttributeException If the effective maximum Levenshtein distance is negative.
	 */
	static
	public String normalize(TextIndex textIndex, TextIndexNormalization textIndexNormalization, String string){
		Boolean tokenize = textIndexNormalization.isTokenize();
		if(tokenize == null){
			tokenize = textIndex.isTokenize();
		}

		TextTokenizer tokenizer = null;

		if(tokenize){
			// Remember which element declared the separator, so that it is the one handed to the regex compiler
			PMMLObject separatorOwner = textIndexNormalization;

			String separatorRE = textIndexNormalization.getWordSeparatorCharacterRE();
			if(separatorRE == null){
				separatorOwner = textIndex;
				separatorRE = textIndex.getWordSeparatorCharacterRE();
			}

			tokenizer = new TextTokenizer(RegExUtil.compile(separatorRE, separatorOwner));
		}

		Boolean caseSensitive = textIndexNormalization.isCaseSensitive();
		if(caseSensitive == null){
			caseSensitive = textIndex.isCaseSensitive();
		}

		// A negative distance is reported against the element that actually declared it
		Integer maxLevenshteinDistance = textIndexNormalization.getMaxLevenshteinDistance();
		if(maxLevenshteinDistance != null){

			if(maxLevenshteinDistance < 0){
				throw new InvalidAttributeException(textIndexNormalization, PMMLAttributes.TEXTINDEXNORMALIZATION_MAXLEVENSHTEINDISTANCE, maxLevenshteinDistance);
			}
		} else

		{
			maxLevenshteinDistance = textIndex.getMaxLevenshteinDistance();

			if(maxLevenshteinDistance < 0){
				throw new InvalidAttributeException(textIndex, PMMLAttributes.TEXTINDEX_MAXLEVENSHTEINDISTANCE, maxLevenshteinDistance);
			}
		}

		InlineTable inlineTable = InlineTableUtil.getInlineTable(textIndexNormalization);
		if(inlineTable == null){
			return string;
		}

		String inField = textIndexNormalization.getInField();
		String outField = textIndexNormalization.getOutField();
		String regexField = textIndexNormalization.getRegexField();

		String current = string;

		for(;;){
			String next;

			try {
				next = normalize(inlineTable, inField, outField, regexField, current, tokenizer, caseSensitive, maxLevenshteinDistance);
			} catch(PMMLException pe){
				throw pe.ensureContext(textIndexNormalization);
			}

			// "If the recursive flag is set to true, then the normalization table is reapplied until none of its rows causes a change to the input text."
			if(textIndexNormalization.isRecursive() && !(next).equals(current)){
				current = next;

				continue;
			}

			return next;
		}
	}

	/**
	 * Applies the rows of a normalization table to a string, one row after another.
	 *
	 * <p>
	 * For every row, all occurrences of the in cell are replaced with the out cell.
	 * The in cell is treated as a regular expression only if the regex cell of that row equals "true" (ignoring case);
	 * otherwise both the in cell and the out cell are taken literally.
	 * </p>
	 *
	 * @param inlineTable The normalization table.
	 * @param inColumn The column holding the text (or regular expression) to look for.
	 * @param outColumn The column holding the replacement text.
	 * @param regexColumn The column holding the regex flag.
	 * @param string The string to normalize.
	 * @param tokenizer Not used by this method.
	 * @param caseSensitive If <code>false</code>, matching is done with the {@link Pattern#CASE_INSENSITIVE} and {@link Pattern#UNICODE_CASE} flags.
	 * @param maxLevenshteinDistance Not used by this method.
	 *
	 * @return The string after all rows have been applied.
	 *
	 * @throws InvalidElementException If a row does not define the in cell or the out cell.
	 */
	static
	String normalize(InlineTable inlineTable, String inColumn, String outColumn, String regexColumn, String string, TextTokenizer tokenizer, boolean caseSensitive, int maxLevenshteinDistance){
		Table<Integer, String, String> table = InlineTableUtil.getContent(inlineTable);

		int regexFlags = (caseSensitive ? 0 : (Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE));

		List<Row> rows = inlineTable.getRows();
		for(int i = 0; i < rows.size(); i++){
			Row row = rows.get(i);

			// The content table is looked up using one-based row keys
			Integer rowKey = (i + 1);

			String inValue = table.get(rowKey, inColumn);
			if(inValue == null){
				throw new InvalidElementException("Cell " + PMMLException.formatKey(inColumn) + " is not defined", row);
			}

			String outValue = table.get(rowKey, outColumn);
			if(outValue == null){
				throw new InvalidElementException("Cell " + PMMLException.formatKey(outColumn) + " is not defined", row);
			}

			String regexValue = table.get(rowKey, regexColumn);

			// "If there is a regexField column and its value for that row is true, then the string in the inField column should be treated as a PCRE regular expression"
			boolean regex = ("true").equalsIgnoreCase(regexValue);

			String patternString;
			String replacement;

			if(regex){
				patternString = inValue;

				// Group references in the out cell stay live for regex rows
				replacement = outValue;
			} else

			{
				patternString = Pattern.quote(inValue);

				// A literal row must also have a literal replacement; otherwise '$' and '\' in the out cell
				// would be interpreted as group references/escapes and could make replaceAll throw
				replacement = Matcher.quoteReplacement(outValue);
			}

			Pattern pattern = RegExUtil.compile(patternString, regexFlags, row);

			Matcher matcher = pattern.matcher(string);

			string = matcher.replaceAll(replacement);
		}

		return string;
	}


	/**
	 * Splits a text into tokens using the word separator regular expression of a text index.
	 *
	 * @param textIndex The text index that supplies the tokenize flag and the word separator regular expression.
	 * @param text The text to split.
	 *
	 * @return The tokens produced by a {@link TextTokenizer} built around the compiled word separator pattern.
	 *
	 * @throws UnsupportedAttributeException If the text index has tokenization turned off.
	 */
	static
	public List<String> tokenize(TextIndex textIndex, String text){
		boolean tokenize = textIndex.isTokenize();

		// Only the tokenizing mode is implemented
		if(!tokenize){
			throw new UnsupportedAttributeException(textIndex, PMMLAttributes.TEXTINDEX_TOKENIZE, tokenize);
		}

		String wordSeparatorCharacterRE = textIndex.getWordSeparatorCharacterRE();

		Pattern pattern = RegExUtil.compile(wordSeparatorCharacterRE, textIndex);

		TextTokenizer tokenizer = new TextTokenizer(pattern);

		return tokenizer.tokenize(text);
	}

	/**
	 * Counts the occurrences of a term inside a text, using the matching settings of a text index.
	 *
	 * @param textIndex The text index that supplies the case sensitivity, maximum Levenshtein distance,
	 * count hits and local term weights settings.
	 * @param textTokens The tokens of the text.
	 * @param termTokens The tokens of the term.
	 *
	 * @return The frequency of the term in the text. Zero, if either token list is empty.
	 * With binary local term weights, the result is capped at one.
	 *
	 * @throws InvalidAttributeException If the maximum Levenshtein distance is negative.
	 * @throws UnsupportedAttributeException If the count hits or local term weights setting is not one of the handled values.
	 */
	static
	public int termFrequency(TextIndex textIndex, List<String> textTokens, List<String> termTokens){

		// Nothing can match; return before any attribute is validated
		if(textTokens.isEmpty() || termTokens.isEmpty()){
			return 0;
		}

		boolean caseSensitive = textIndex.isCaseSensitive();

		int maxLevenshteinDistance = textIndex.getMaxLevenshteinDistance();
		if(maxLevenshteinDistance < 0){
			throw new InvalidAttributeException(textIndex, PMMLAttributes.TEXTINDEX_MAXLEVENSHTEINDISTANCE, maxLevenshteinDistance);
		}

		boolean bestHits;

		TextIndex.CountHits countHits = textIndex.getCountHits();
		switch(countHits){
			case BEST_HITS:
				bestHits = true;
				break;
			case ALL_HITS:
				bestHits = false;
				break;
			default:
				throw new UnsupportedAttributeException(textIndex, countHits);
		}

		// Upper bound for the count; lets the search stop as soon as the result cannot change any more
		int maxFrequency;

		TextIndex.LocalTermWeights localTermWeights = textIndex.getLocalTermWeights();
		switch(localTermWeights){
			case BINARY:
				maxFrequency = 1;
				break;
			case TERM_FREQUENCY:
			case LOGARITHMIC:
				maxFrequency = Integer.MAX_VALUE;
				break;
			default:
				throw new UnsupportedAttributeException(textIndex, localTermWeights);
		}

		try {
			return termFrequency(textTokens, termTokens, caseSensitive, maxLevenshteinDistance, bestHits, maxFrequency);
		} catch(PMMLException pe){
			throw pe.ensureContext(textIndex);
		}
	}

	/**
	 * Counts the occurrences of a token sequence (the term) inside another token sequence (the text).
	 *
	 * <p>
	 * The term is compared against every window of consecutive text tokens that has the same length as the term.
	 * A window is a hit if the sum of its per-token Levenshtein distances stays within the maximum Levenshtein distance.
	 * </p>
	 *
	 * @param textTokens The tokens of the text.
	 * @param termTokens The tokens of the term.
	 * @param caseSensitive Whether tokens are compared case-sensitively.
	 * @param maxLevenshteinDistance The Levenshtein distance budget for one window.
	 * @param bestHits If <code>true</code>, only the hits that share the lowest distance found are counted.
	 * If <code>false</code>, all hits are counted.
	 * @param maxFrequency The upper bound of the result.
	 *
	 * @return The number of counted hits, never more than the upper bound.
	 */
	static
	int termFrequency(List<String> textTokens, List<String> termTokens, boolean caseSensitive, int maxLevenshteinDistance, boolean bestHits, int maxFrequency){
		int frequency = 0;

		int bestLevenshteinDistance = Integer.MAX_VALUE;

		int textSize = textTokens.size();
		int termSize = termTokens.size();

		// If the term is longer than the text, then max is negative and the loop body never runs
		for(int i = 0, max = (textSize - termSize); i <= max; i++){
			int levenshteinDistance = windowDistance(textTokens, termTokens, i, caseSensitive, maxLevenshteinDistance);

			if(levenshteinDistance < 0){
				continue;
			} // End if

			if(bestHits){

				if(levenshteinDistance < bestLevenshteinDistance){
					// A strictly better hit invalidates everything counted so far
					frequency = 1;

					bestLevenshteinDistance = levenshteinDistance;
				} else

				if(levenshteinDistance == bestLevenshteinDistance){
					frequency++;
				} else

				{
					continue;
				} // End if

				// Exact hits cannot be beaten, so the count is final once the upper bound is reached
				if((bestLevenshteinDistance == 0) && (frequency >= maxFrequency)){
					return frequency;
				}
			} else

			{
				frequency++;

				if(frequency >= maxFrequency){
					return frequency;
				}
			}
		}

		return Math.min(maxFrequency, frequency);
	}

	/**
	 * Computes the total Levenshtein distance between the term and the text window that starts at the specified offset.
	 *
	 * @return The sum of the per-token distances, or <code>-1</code> if the window is not a hit.
	 */
	static
	private int windowDistance(List<String> textTokens, List<String> termTokens, int offset, boolean caseSensitive, int maxLevenshteinDistance){
		int levenshteinDistance = 0;

		for(int j = 0, termSize = termTokens.size(); j < termSize; j++){
			// The part of the budget that has not been used up by the preceding tokens
			int threshold = (maxLevenshteinDistance - levenshteinDistance);

			String textToken = textTokens.get(offset + j);
			String termToken = termTokens.get(j);

			if(threshold == 0){
				// With no budget left, a plain equality check is sufficient
				boolean equals = (caseSensitive ? (textToken).equals(termToken) : (textToken).equalsIgnoreCase(termToken));

				if(!equals){
					return -1;
				}
			} else

			{
				int tokenLevenshteinDistance = LevenshteinDistanceUtil.limitedCompare(textToken, termToken, caseSensitive, threshold);

				// A negative result is treated as "not a hit"
				if(tokenLevenshteinDistance < 0){
					return -1;
				}

				levenshteinDistance += tokenLevenshteinDistance;
			}
		}

		return levenshteinDistance;
	}

	/**
	 * Base class for turning a field value into a list of tokens, in the context of a text index.
	 *
	 * <p>
	 * Instances are immutable: the text index and the value are fixed at construction time.
	 * </p>
	 */
	static
	abstract
	class StringProcessor {

		private final TextIndex textIndex;

		private final FieldValue value;


		/**
		 * @param textIndex The text index; must not be <code>null</code>.
		 * @param value The value to process; must not be <code>null</code>.
		 *
		 * @throws NullPointerException If either argument is <code>null</code>.
		 */
		public StringProcessor(TextIndex textIndex, FieldValue value){
			this.textIndex = Objects.requireNonNull(textIndex);
			this.value = Objects.requireNonNull(value);
		}

		/**
		 * @return The tokens of the value.
		 */
		abstract
		public List<String> process();

		public TextIndex getTextIndex(){
			return this.textIndex;
		}

		public FieldValue getValue(){
			return this.value;
		}
	}

	/**
	 * Produces the tokens of a text value: the value is first normalized and then tokenized.
	 */
	static
	class TextProcessor extends StringProcessor {

		TextProcessor(TextIndex textIndex, FieldValue value){
			super(textIndex, value);
		}

		/**
		 * @return The tokens of the normalized text. The result is stored in a cache under the value,
		 * and subsequent calls for the same text index and value return the cached list.
		 */
		@Override
		public List<String> process(){
			TextIndex textIndex = getTextIndex();
			FieldValue value = getValue();

			// NOTE(review): presumably CacheUtil#getValue creates the cache of this text index on first use via the loader - confirm
			Cache<FieldValue, List<String>> textTokenCache = CacheUtil.getValue(textIndex, TextUtil.textTokenCaches, TextUtil.textTokenCacheLoader);

			List<String> tokens = textTokenCache.getIfPresent(value);
			if(tokens == null){
				String string = TextUtil.normalize(textIndex, value.asString());

				tokens = TextUtil.tokenize(textIndex, string);

				textTokenCache.put(value, tokens);
			}

			return tokens;
		}
	}

	/**
	 * Produces the tokens of a term value: the value is tokenized as is, without normalization.
	 */
	static
	class TermProcessor extends StringProcessor {

		TermProcessor(TextIndex textIndex, FieldValue value){
			super(textIndex, value);
		}

		/**
		 * @return The tokens of the term. The result is stored in a cache under the value,
		 * and subsequent calls for the same text index and value return the cached list.
		 */
		@Override
		public List<String> process(){
			TextIndex textIndex = getTextIndex();
			FieldValue value = getValue();

			// NOTE(review): presumably CacheUtil#getValue creates the cache of this text index on first use via the loader - confirm
			Cache<FieldValue, List<String>> termTokenCache = CacheUtil.getValue(textIndex, TextUtil.termTokenCaches, TextUtil.termTokenCacheLoader);

			List<String> tokens = termTokenCache.getIfPresent(value);
			if(tokens == null){
				String string = value.asString();

				tokens = TextUtil.tokenize(textIndex, string);

				termTokenCache.put(value, tokens);
			}

			return tokens;
		}
	}

	private static final Cache>> textTokenCaches = CacheUtil.buildCache();

	private static final Callable>> textTokenCacheLoader = new Callable>>(){

		@Override
		public Cache> call(){
			return CacheUtil.buildCache();
		}
	};

	private static final Cache>> termTokenCaches = CacheUtil.buildCache();

	private static final Callable>> termTokenCacheLoader = new Callable>>(){

		@Override
		public Cache> call(){
			return CacheUtil.buildCache();
		}
	};
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy