All downloads are free. The search and download features use the official Maven repository.

org.itc.irst.tcc.sre.util.Orthographic Maven / Gradle / Ivy

/*
 * Copyright 2005 FBK-irst (http://www.fbk.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.itc.irst.tcc.sre.util;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
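 * Label constants and static helper methods for describing the
 * orthographic form of a token, such as character n-grams, prefixes,
 * symbol checks and token splitting.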
 *
 * @author 	Claudio Giuliano
 * @version %I%, %G%
 * @since		1.0
 */
public class Orthographic
{
	/**
	 * Static logger for this class. The logger is looked up by the fully
	 * qualified class name returned by <code>Orthographic.class.getName()</code>.
	 */
	static Logger logger = LoggerFactory.getLogger(Orthographic.class.getName());

	// String labels for general token properties: surface form, type, lemma,
	// part of speech, stem, letter case, and character class.
	// NOTE(review): presumably these are used as feature-name tags by callers;
	// the usage is not visible in this part of the file -- confirm.
	public static final String WORD_FORM = "_word";
	public static final String TYPE = "_type";
	public static final String LEMMA = "_lemma";
	public static final String PART_OF_SPEECH = "_POS";
	public static final String STEM = "_stem";
	public static final String UPPER_CASE = "_UPPER";
	public static final String LOWER_CASE = "_low";
	public static final String CAPITALIZED = "_Cap";
	public static final String NUMERIC = "_num";
	public static final String ALPHANUMERIC = "_alpha";
	public static final String PUNCTUATION = "_punct";
	public static final String PARENTHESIS = "_par";
	public static final String QUOTE = "_quote";

	// String labels for finer-grained orthographic patterns (digits only,
	// trailing dots, single character, Roman numeral, Greek letter, ...).
	public static final String ALL_DIGITS = "_digs";
	public static final String ENDS_WITH_DOTS = "_ewd";
	public static final String SINGLE_LETTER = "_sch";
	public static final String SINGLE_DIGIT = "_sdig";
	public static final String ROMAN_NUMBER = "_romans";
	public static final String GREEK_LETTER = "_greeks";
	public static final String ALL_CONSONANTS = "_cons";
	public static final String ALL_VOWELS = "_vows";
	public static final String CONTAINS_DASH = "_dash";
	public static final String CONTAINS_SYMBOLS = "_symbs";
	public static final String PERCENTAGE = "_perc";

	// String label "_part".
	// NOTE(review): presumably marks a fragment of a split token -- confirm against callers.
	public static final String PART = "_part";

	// Upper-case Roman numerals for the values 1 through 20.
	private static final String[] romans = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX"};

	// Lower-case English names of Greek letters.
	// NOTE(review): "mu", "nu" and "pi" are not in this list -- confirm whether
	// the omission is intentional.
	private static final String[] greekLetters = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "xi", "omicron", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"};

	// Upper-case letters treated as consonants; 'Y' is included (listed last).
	private static final char[] consonants = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z', 'Y'};

	// Upper-case letters treated as vowels ('Y' is not among them).
	private static final char[] vowels = {'A', 'E', 'I', 'O', 'U'};

	//
	/**
	 * Returns every character n-gram of {@code s} whose length lies between
	 * {@code b} and {@code e}, both inclusive.
	 * <p>
	 * N-grams are listed by start position and, for the same start position,
	 * by increasing length. Each one is lower-cased with the locale-neutral
	 * rules of {@link Locale#ROOT} and wrapped as {@code "NGRAM(" + gram + ")"},
	 * so the result does not depend on the platform default locale.
	 *
	 * @param s the string to take the n-grams from; must not be {@code null}
	 * @param b the minimum n-gram length; a negative value is treated as 0.
	 *          With a minimum of 0 the empty n-gram {@code "NGRAM()"} is
	 *          emitted once for every start position, including the end of
	 *          the string
	 * @param e the maximum n-gram length; values larger than the string are
	 *          allowed (including {@code Integer.MAX_VALUE}) and simply mean
	 *          "up to the end of the string"
	 * @return an {@code Object[]} of {@code String} elements; empty when
	 *         {@code s} is shorter than {@code b} or when {@code e} is
	 *         smaller than {@code b}
	 * @throws NullPointerException if {@code s} is {@code null}
	 */
	public static Object[] characterNGram(String s, int b, int e)
	{
		List<String> ngrams = new ArrayList<String>();

		// A negative minimum length means the same as "no minimum". Without
		// this clamp substring() would be called with an end before its start.
		int minLength = Math.max(b, 0);
		int length = s.length();

		if (length < minLength)
			return ngrams.toArray();

		// Last start position that still leaves room for minLength characters.
		int lastStart = length - minLength;

		for (int start = 0; start <= lastStart; start++)
		{
			// Compare e with the remaining length instead of computing
			// start + e first: that sum overflows when e is close to
			// Integer.MAX_VALUE and would silently drop the n-grams.
			int lastEnd = (e > length - start) ? length : start + e;

			for (int end = start + minLength; end <= lastEnd; end++)
			{
				// Lower-case each substring separately (not the whole string
				// up front) because case mapping can change the string length.
				ngrams.add("NGRAM(" + s.substring(start, end).toLowerCase(Locale.ROOT) + ")");
			}
		}

		return ngrams.toArray();
	} // end characterNGram

	//
	public static Object[] prefixes(String s, int b, int e)
	{
		List list = new ArrayList();

		//logger.info(s + ", " + s.length() + ", " + b + ", " + e);

		if (s.length() < b)
			return list.toArray();

		int end = e + 1, begin = b;

		if (end > s.length())
			end = s.length();

		//logger.info("begin = " + begin + ", end = " + end);

		for (int i=begin;i': return true;
			case '<': return true;
			case '_': return true;
			case '\\': return true;
			//case '�': return true;
		}

		return false;
	} // end isSymbol

	//
	public static final boolean containsSymbols(String s)
	{
		//logger.debug("Orthographic.containsSymbols: " + s);

		for(int i=0;i s.length())
					e = s.length();

				list.add(s.substring(i, e));
			} // end for j
		} // end for i

		return list.toArray();
	} // end infix


	//
	public static final Object[] split(String s)
	{
		//logger.debug("Orthographic.split: \"" + s + "\"");
		List list = new ArrayList();

		if (s.length() < 2)
			return null;

		StringBuffer sb = new StringBuffer();

		char x = ' ', y = ' ';
		int a = 0, b = 0;

		for (int i=0;i 0)
						list.add(sb.toString());
					//logger.debug("add " + sb);
					sb = new StringBuffer();
				}
			}
		}

		if (b != Character.DASH_PUNCTUATION)
			sb.append(y);

		if (sb.length() > 0)
			list.add(sb.toString());

		//logger.debug("add " + sb);
		/*
		Iterator it = list.iterator();
		while (it.hasNext())
			logger.debug(it.next());
		*/

		if (list.size() > 1)
			return list.toArray();

		return null;
	} // end split

	//
	public static void main(String args[]) throws Exception
	{
		

		if (args.length != 1)
		{
			System.err.println("Wrong number of parameters " + args.length);
			System.err.println("Usage: java org.itc.irst.tcc.ker.data.Orthographic token");
			System.exit(-1);
		}

		String t = args[0];

		Orthographic.test(t);
	/*
		Object[] o = Orthographic.split(t);

		if (o != null)
		{
			for (int i=0;i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy