All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.namefind.RegexNameFinderFactory Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.namefind;

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;

/**
 * Factory for {@link RegexNameFinder} instances that are built from a selection of the
 * predefined {@link DEFAULT_REGEX_NAME_FINDER} patterns, optionally merged with
 * externally configured patterns.
 */
public class RegexNameFinderFactory {

  /**
   * Builds a {@link RegexNameFinder} from the selected defaults combined with regexes
   * supplied by an external configuration.
   *
   * @param config   A {@link Map} where the key is a type, and the value is a
   *                 {@code Pattern[]}. If a key clashes with one of the default keys,
   *                 the config map entry will be taken. Must not be {@code null}.
   * @param defaults Zero or more of the default {@link DEFAULT_REGEX_NAME_FINDER} enum
   *                 values. If the array itself is {@code null}, only {@code config}
   *                 is used.
   * @return A {@link RegexNameFinder} instance.
   * @throws NullPointerException if {@code config} is {@code null} or {@code defaults}
   *                              contains a {@code null} element.
   */
  public static synchronized RegexNameFinder getDefaultRegexNameFinders(
      Map<String, Pattern[]> config, DEFAULT_REGEX_NAME_FINDER... defaults) {
    Objects.requireNonNull(config, "config must not be null");

    final Map<String, Pattern[]> regexMap;
    if (defaults == null) {
      regexMap = new HashMap<>();
    } else {
      regexMap = defaultsToMap(defaults);
    }
    // Applied last so that configured entries replace defaults registered under the same type.
    regexMap.putAll(config);
    return new RegexNameFinder(regexMap);
  }

  /**
   * Retrieves a {@link RegexNameFinder} that will utilize specified default regexes.
   *
   * @param defaults One or more of the default {@link DEFAULT_REGEX_NAME_FINDER} enum values.
   *                 Must not be {@code null}.
   * @return A {@link RegexNameFinder} instance.
   * @throws NullPointerException if {@code defaults} is {@code null} or contains a
   *                              {@code null} element.
   */
  public static synchronized RegexNameFinder getDefaultRegexNameFinders(
      DEFAULT_REGEX_NAME_FINDER... defaults) {
    Objects.requireNonNull(defaults, "defaults must not be null");
    return new RegexNameFinder(defaultsToMap(defaults));
  }

  /**
   * Merges the regex maps of the given defaults into one new, mutable map. If two defaults
   * share a type key, the one appearing later in {@code defaults} replaces the earlier one.
   * <p>
   * Not {@code synchronized} itself: it is only invoked from the public factory methods,
   * which already hold the class monitor, and it touches no shared state.
   *
   * @param defaults The defaults to merge. Neither the array nor its elements may be
   *                 {@code null}.
   * @return A new {@link Map} from type to its {@code Pattern[]}.
   */
  private static Map<String, Pattern[]> defaultsToMap(
      DEFAULT_REGEX_NAME_FINDER... defaults) {
    Map<String, Pattern[]> regexMap = new HashMap<>();
    for (DEFAULT_REGEX_NAME_FINDER def : defaults) {
      Objects.requireNonNull(def, "defaults must not contain null elements");
      regexMap.putAll(def.getRegexMap());
    }
    return regexMap;
  }

  /**
   * Describes something that can supply regex patterns grouped by a type name.
   */
  public interface RegexAble {

    /**
     * @return A {@link Map} where the key is a type and the value is the
     *         {@code Pattern[]} used to detect that type.
     */
    Map<String, Pattern[]> getRegexMap();

    /**
     * @return The type name associated with the patterns.
     */
    String getType();
  }

  /**
   * Enumeration of typical regex expressions available in OpenNLP.
   * <p>
   * Every constant holds exactly one {@link Pattern}, compiled once when the enum is
   * initialized. {@link Pattern} instances are immutable, so they are shared between
   * calls; the map and array returned by {@link #getRegexMap()} are created fresh on
   * each call.
   */
  public enum DEFAULT_REGEX_NAME_FINDER implements RegexAble {

    /**
     * Type {@code PHONE_NUM}: a number of the form {@code ddd-dddd}, optionally preceded
     * by an area code written as {@code (ddd)} (with an optional trailing space) or
     * as {@code ddd-}.
     */
    USA_PHONE_NUM("PHONE_NUM",
        Pattern.compile("((\\(\\d{3}\\) ?)|(\\d{3}-))?\\d{3}-\\d{4}")),

    /**
     * Type {@code EMAIL}: e-mail addresses, matched case-insensitively.
     */
    EMAIL("EMAIL",
        Pattern.compile("([a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*" +
            "|\"([\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09" +
            "\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9]([a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]" +
            "*[a-z0-9])?|\\[((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]" +
            "?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]" +
            "|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", Pattern.CASE_INSENSITIVE)),

    /**
     * Type {@code URL}: URLs introduced by an http(s)/ftp(s) scheme, a leading
     * {@code ~/} or {@code /}, or a literal {@code www.} prefix, matched
     * case-insensitively.
     */
    // The dot after "www" is escaped so that only a literal '.' is accepted, and every
    // percent-escape alternative requires exactly two hex digits ("%[a-f\d]{2}").
    URL("URL",
        Pattern.compile("\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www\\.)"
            + "(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov"
            + "|mil|biz|info|mobi|name|aero|jobs|museum"
            + "|travel|[a-z]{2}))(:[\\d]{1,5})?"
            + "(((\\/([-\\w~!$+|.,=]|%[a-f\\d]{2})+)+|\\/)+|\\?|#)?"
            + "((\\?([-\\w~!$+|.,*:]|%[a-f\\d]{2})+=?"
            + "([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)"
            + "(&(?:[-\\w~!$+|.,*:]|%[a-f\\d]{2})+=?"
            + "([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)*)*"
            + "(#([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)?\\b", Pattern.CASE_INSENSITIVE)),

    /**
     * Type {@code MGRS}: one or two digits and a letter, followed by two letters and two
     * groups of one to five digits, with optional whitespace between the parts.
     */
    MGRS("MGRS",
        Pattern.compile("\\d{1,2}[A-Za-z]\\s*[A-Za-z]{2}\\s*\\d{1,5}\\s*\\d{1,5}",
            Pattern.CASE_INSENSITIVE)),

    /**
     * Type {@code DEGREES_MIN_SEC_LAT_LON}: a pair of degree values, each with optional
     * minutes, seconds and hemisphere letter, separated by whitespace and/or a comma.
     */
    // NOTE(review): the '|' characters inside the character classes below are literal
    // pipes, not alternation, so each of those classes also accepts a '|' character.
    // Left unchanged to preserve current matching behaviour - confirm before tightening.
    DEGREES_MIN_SEC_LAT_LON("DEGREES_MIN_SEC_LAT_LON",
        Pattern.compile("([-|\\+]?\\d{1,3}[d|D|\\u00B0|\\s](\\s*\\d{1,2}['|\\u2019|\\s])" +
            "?(\\s*\\d{1,2}[\\\"|\\u201d])?\\s*[N|n|S|s]?)(\\s*|,|,\\s*)([-|\\+]?\\d{1,3}[d|D|\\u00B0|" +
            "\\s](\\s*\\d{1,2}['|\\u2019|\\s])?(\\s*\\d{1,2}[\\\"|\\u201d])?\\s*[E|e|W|w]?)",
            Pattern.CASE_INSENSITIVE));

    /** Key under which this constant's pattern is registered. */
    private final String type;

    /** The single pattern of this constant, compiled once. */
    private final Pattern pattern;

    DEFAULT_REGEX_NAME_FINDER(String type, Pattern pattern) {
      this.type = type;
      this.pattern = pattern;
    }

    /**
     * @return A new, mutable {@link Map} with a single entry that maps
     *         {@link #getType()} to a new one-element {@code Pattern[]}.
     */
    @Override
    public Map<String, Pattern[]> getRegexMap() {
      Map<String, Pattern[]> regexMap = new HashMap<>();
      // A fresh array per call keeps callers from mutating state shared with other callers.
      regexMap.put(type, new Pattern[] {pattern});
      return regexMap;
    }

    /**
     * @return The type name used as key in {@link #getRegexMap()}.
     */
    @Override
    public String getType() {
      return type;
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy