// org.itc.irst.tcc.sre.util.Orthographic Maven / Gradle / Ivy
/*
* Copyright 2005 FBK-irst (http://www.fbk.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.itc.irst.tcc.sre.util;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
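* Orthographic feature labels and static string helpers
* (character n-grams, prefixes, symbol checks, token splitting).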
*
* @author Claudio Giuliano
* @version %I%, %G%
* @since 1.0
*/
public class Orthographic
{
/**
* Define a static logger variable so that it references the
* Logger instance named {@code Orthographic}.
*/
static Logger logger = LoggerFactory.getLogger(Orthographic.class.getName());
//
// String labels for token attributes and coarse orthographic classes.
// NOTE(review): presumably used as feature names by code outside this
// excerpt - confirm against the callers.
public static final String WORD_FORM = "_word";
public static final String TYPE = "_type";
public static final String LEMMA = "_lemma";
public static final String PART_OF_SPEECH = "_POS";
public static final String STEM = "_stem";
public static final String UPPER_CASE = "_UPPER";
public static final String LOWER_CASE = "_low";
public static final String CAPITALIZED = "_Cap";
public static final String NUMERIC = "_num";
public static final String ALPHANUMERIC = "_alpha";
public static final String PUNCTUATION = "_punct";
public static final String PARENTHESIS = "_par";
public static final String QUOTE = "_quote";
// Labels for finer-grained orthographic properties (digits, single
// characters, roman/greek names, consonant/vowel make-up, dashes,
// symbols, percentages).
public static final String ALL_DIGITS = "_digs";
public static final String ENDS_WITH_DOTS = "_ewd";
public static final String SINGLE_LETTER = "_sch";
public static final String SINGLE_DIGIT = "_sdig";
public static final String ROMAN_NUMBER = "_romans";
public static final String GREEK_LETTER = "_greeks";
public static final String ALL_CONSONANTS = "_cons";
public static final String ALL_VOWELS = "_vows";
public static final String CONTAINS_DASH = "_dash";
public static final String CONTAINS_SYMBOLS = "_symbs";
public static final String PERCENTAGE = "_perc";
// Label "_part"; its exact meaning is not evident from this excerpt - TODO confirm.
public static final String PART = "_part";
//
// Upper-case roman numerals for 1 through 20.
private static final String[] romans = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX"};
//
// Lower-case English names of Greek letters.
// NOTE(review): "mu", "nu" and "pi" are not listed - confirm whether the
// omission is intentional.
private static final String[] greekLetters = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "xi", "omicron", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"};
//
// Upper-case consonants; 'Y' is counted as a consonant here.
private static final char[] consonants = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z', 'Y'};
//
// Upper-case vowels.
private static final char[] vowels = {'A', 'E', 'I', 'O', 'U'};
//
/**
 * Returns the character n-grams of {@code s} whose length lies between
 * {@code b} and {@code e}, both inclusive.
 * <p>
 * Each n-gram is lower-cased and wrapped as {@code NGRAM(...)}. The n-grams
 * are listed by start position and, for the same start position, by
 * increasing length. Lower-casing uses {@link Locale#ROOT}, so the result
 * does not depend on the default locale of the JVM.
 *
 * @param s the string to cut into n-grams
 * @param b the minimum n-gram length; expected to be at least 1
 *          (0 additionally yields empty {@code NGRAM()} entries)
 * @param e the maximum n-gram length; may be as large as
 *          {@code Integer.MAX_VALUE} to mean "up to the end of the string"
 * @return an array of {@code String} n-grams; empty when {@code s} is
 *         shorter than {@code b} or when {@code e} is smaller than {@code b}
 * @throws NullPointerException if {@code s} is {@code null}
 */
public static Object[] characterNGram(String s, int b, int e)
{
    List<String> ngrams = new ArrayList<String>();
    int length = s.length();
    if (length < b)
    {
        return ngrams.toArray();
    }
    // The clamp only matters for a negative b, where it keeps the start
    // index inside the string.
    int lastStart = Math.min(length - b, length);
    for (int start = 0; start <= lastStart; start++)
    {
        // Same as min(start + e, length), but written so that a very large
        // e cannot overflow and silently suppress the n-grams of every
        // start position after the first one.
        int maxEnd = (e > length - start) ? length : start + e;
        for (int stop = start + b; stop <= maxEnd; stop++)
        {
            ngrams.add("NGRAM(" + s.substring(start, stop).toLowerCase(Locale.ROOT) + ")");
        }
    }
    return ngrams.toArray();
} // end characterNGram
//
public static Object[] prefixes(String s, int b, int e)
{
List list = new ArrayList();
//logger.info(s + ", " + s.length() + ", " + b + ", " + e);
if (s.length() < b)
return list.toArray();
int end = e + 1, begin = b;
if (end > s.length())
end = s.length();
//logger.info("begin = " + begin + ", end = " + end);
for (int i=begin;i': return true;
case '<': return true;
case '_': return true;
case '\\': return true;
//case '�': return true;
}
return false;
} // end isSymbol
//
public static final boolean containsSymbols(String s)
{
//logger.debug("Orthographic.containsSymbols: " + s);
for(int i=0;i s.length())
e = s.length();
list.add(s.substring(i, e));
} // end for j
} // end for i
return list.toArray();
} // end infix
//
public static final Object[] split(String s)
{
//logger.debug("Orthographic.split: \"" + s + "\"");
List list = new ArrayList();
if (s.length() < 2)
return null;
StringBuffer sb = new StringBuffer();
char x = ' ', y = ' ';
int a = 0, b = 0;
for (int i=0;i 0)
list.add(sb.toString());
//logger.debug("add " + sb);
sb = new StringBuffer();
}
}
}
if (b != Character.DASH_PUNCTUATION)
sb.append(y);
if (sb.length() > 0)
list.add(sb.toString());
//logger.debug("add " + sb);
/*
Iterator it = list.iterator();
while (it.hasNext())
logger.debug(it.next());
*/
if (list.size() > 1)
return list.toArray();
return null;
} // end split
//
public static void main(String args[]) throws Exception
{
if (args.length != 1)
{
System.err.println("Wrong number of parameters " + args.length);
System.err.println("Usage: java org.itc.irst.tcc.ker.data.Orthographic token");
System.exit(-1);
}
String t = args[0];
Orthographic.test(t);
/*
Object[] o = Orthographic.split(t);
if (o != null)
{
for (int i=0;i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy