All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.codec.language.bm.Lang Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language.bm;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * 

* Language guessing utility. *

*

* This class encapsulates rules used to guess the possible languages that a word originates from. This is done by reference to a whole * series of rules distributed in resource files. *

*

* Instances of this class are typically managed through the static factory method instance(). Unless you are developing your own language * guessing rules, you will not need to interact with this class directly. *

*

* This class is intended to be immutable and thread-safe. *

*

Lang resources

* Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named * following the pattern:
org/apache/commons/codec/language/bm/lang.txt
The format of these resources is the * following: *

*
    *
  • Rules: whitespace separated strings. There should be 3 columns to each row, and these will be interpreted as: *
      *
    1. pattern: a regular expression.
    2. *
    3. languages: a '+'-separated list of languages.
    4. *
    5. acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.
    6. *
    *
  • *
  • End-of-line comments: Any occurance of '//' will cause all text following on that line to be discarded as a comment.
  • *
  • Multi-line comments: Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a * line ending in '*' and '/' is found.
  • *
  • Blank lines: All blank lines will be skipped.
  • *
*

* Port of lang.php * * @author Apache Software Foundation * @since 1.6 */ public class Lang { // Implementation note: This class is divided into two sections. The first part is a static factory interface that // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that // encapsulate a particular language-guessing rule table and the language guessing itself. // // It may make sense in the future to expose the private constructor to allow power users to build custom language- // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users // should be strongly encouraged to use the static factory instance method to get their Lang instances. private static final class LangRule { private final boolean acceptOnMatch; private final Set languages; private final Pattern pattern; private LangRule(Pattern pattern, Set languages, boolean acceptOnMatch) { this.pattern = pattern; this.languages = languages; this.acceptOnMatch = acceptOnMatch; } public boolean matches(String txt) { return this.pattern.matcher(txt).find(); } } private static final Map Langs = new EnumMap(NameType.class); private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt"; static { for (NameType s : NameType.values()) { Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s))); } } /** * Gets a Lang instance for one of the supported NameTypes. * * @param nameType * the NameType to look up * @return a Lang encapsulating the language guessing rules for that name type */ public static Lang instance(NameType nameType) { return Langs.get(nameType); } /** *

* Loads language rules from a resource. *

*

* In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method. You will only need to call this * yourself if you are developing custom language mapping rules. *

* * @param languageRulesResourceName * the fully-qualified resource name to load * @param languages * the languages that these rules will support * @return a Lang encapsulating the loaded language-guessing rules. */ public static Lang loadFromResource(String languageRulesResourceName, Languages languages) { List rules = new ArrayList(); InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName); if (lRulesIS == null) { throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN); } Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING); boolean inExtendedComment = false; while (scanner.hasNextLine()) { String rawLine = scanner.nextLine(); String line = rawLine; if (inExtendedComment) { if (line.endsWith(ResourceConstants.EXT_CMT_END)) { inExtendedComment = false; } else { // discard doc comment line } } else { if (line.startsWith(ResourceConstants.EXT_CMT_START)) { inExtendedComment = true; } else { // discard comments int cmtI = line.indexOf(ResourceConstants.CMT); if (cmtI >= 0) { // System.err.println("index of comment: " + cmtI); line = line.substring(0, cmtI); } // trim leading-trailing whitespace line = line.trim(); if (line.length() == 0) { continue; // empty lines can be safely skipped } // split it up String[] parts = line.split("\\s+"); // System.err.println("part count: " + parts.length); if (parts.length != 3) { // fixme: we really need to log this somewhere System.err.println("Warning: malformed line '" + rawLine + "'"); continue; } Pattern pattern = Pattern.compile(parts[0]); String[] langs = parts[1].split("\\+"); boolean accept = parts[2].equals("true"); rules.add(new LangRule(pattern, new HashSet(Arrays.asList(langs)), accept)); } } } return new Lang(rules, languages); } private final Languages languages; private final List rules; private Lang(List rules, Languages languages) { this.rules = Collections.unmodifiableList(rules); this.languages = languages; } /** * Guesses the language of a word. * * @param text * the word * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match */ public String guessLanguage(String text) { Languages.LanguageSet ls = guessLanguages(text); return ls.isSingleton() ? ls.getAny() : Languages.ANY; } /** * Guesses the languages of a word. * * @param input * the word * @return a Set of Strings of language names that are potential matches for the input word */ public Languages.LanguageSet guessLanguages(String input) { String text = input.toLowerCase(Locale.ENGLISH); // System.out.println("Testing text: '" + text + "'"); Set langs = new HashSet(this.languages.getLanguages()); for (LangRule rule : this.rules) { if (rule.matches(text)) { // System.out.println("Rule " + rule.pattern + " matches " + text); if (rule.acceptOnMatch) { // System.out.println("Retaining " + rule.languages); langs.retainAll(rule.languages); } else { // System.out.println("Removing " + rule.languages); langs.removeAll(rule.languages); } // System.out.println("Current languages: " + langs); } else { // System.out.println("Rule " + rule.pattern + " does not match " + text); } } Languages.LanguageSet ls = Languages.LanguageSet.from(langs); return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy