All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.analysis.cz.CzechStemmer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.cz;


import static org.apache.lucene.analysis.util.StemmerUtil.*;

/**
 * Light Stemmer for Czech.
 * 

* Implements the algorithm described in: * * Indexing and stemming approaches for the Czech language * * http://portal.acm.org/citation.cfm?id=1598600 *

*/ public class CzechStemmer { /** * Stem an input buffer of Czech text. * * @param s input buffer * @param len length of input buffer * @return length of input buffer after normalization * *

NOTE: Input is expected to be in lowercase, * but with diacritical marks

*/ public int stem(char s[], int len) { len = removeCase(s, len); len = removePossessives(s, len); if (len > 0) { len = normalize(s, len); } return len; } private int removeCase(char s[], int len) { if (len > 7 && endsWith(s, len, "atech")) return len - 5; if (len > 6 && (endsWith(s, len,"ětem") || endsWith(s, len,"etem") || endsWith(s, len,"atům"))) return len - 4; if (len > 5 && (endsWith(s, len, "ech") || endsWith(s, len, "ich") || endsWith(s, len, "ích") || endsWith(s, len, "ého") || endsWith(s, len, "ěmi") || endsWith(s, len, "emi") || endsWith(s, len, "ému") || endsWith(s, len, "ěte") || endsWith(s, len, "ete") || endsWith(s, len, "ěti") || endsWith(s, len, "eti") || endsWith(s, len, "ího") || endsWith(s, len, "iho") || endsWith(s, len, "ími") || endsWith(s, len, "ímu") || endsWith(s, len, "imu") || endsWith(s, len, "ách") || endsWith(s, len, "ata") || endsWith(s, len, "aty") || endsWith(s, len, "ých") || endsWith(s, len, "ama") || endsWith(s, len, "ami") || endsWith(s, len, "ové") || endsWith(s, len, "ovi") || endsWith(s, len, "ými"))) return len - 3; if (len > 4 && (endsWith(s, len, "em") || endsWith(s, len, "es") || endsWith(s, len, "ém") || endsWith(s, len, "ím") || endsWith(s, len, "ům") || endsWith(s, len, "at") || endsWith(s, len, "ám") || endsWith(s, len, "os") || endsWith(s, len, "us") || endsWith(s, len, "ým") || endsWith(s, len, "mi") || endsWith(s, len, "ou"))) return len - 2; if (len > 3) { switch (s[len - 1]) { case 'a': case 'e': case 'i': case 'o': case 'u': case 'ů': case 'y': case 'á': case 'é': case 'í': case 'ý': case 'ě': return len - 1; } } return len; } private int removePossessives(char s[], int len) { if (len > 5 && (endsWith(s, len, "ov") || endsWith(s, len, "in") || endsWith(s, len, "ův"))) return len - 2; return len; } private int normalize(char s[], int len) { if (endsWith(s, len, "čt")) { // čt -> ck s[len - 2] = 'c'; s[len - 1] = 'k'; return len; } if (endsWith(s, len, "št")) { // št -> sk s[len - 2] = 's'; s[len - 1] = 'k'; 
return len; } switch(s[len - 1]) { case 'c': // [cč] -> k case 'č': s[len - 1] = 'k'; return len; case 'z': // [zž] -> h case 'ž': s[len - 1] = 'h'; return len; } if (len > 1 && s[len - 2] == 'e') { s[len - 2] = s[len - 1]; // e* > * return len - 1; } if (len > 2 && s[len - 2] == 'ů') { s[len - 2] = 'o'; // *ů* -> *o* return len; } return len; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy