All downloads are free. Search and download functionality uses the official Maven repository.

com.tupilabs.human_name_parser.HumanNameParserParser Maven / Gradle / Ivy

Go to download

HumanNameParser.java is a Java port of HumanNameParser.php, a parser for human names. All credit goes to @jasonpriem.

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2010-2015 Jason Priem, Bruno P. Kinoshita
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package com.tupilabs.human_name_parser;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * 

A parser capable of parsing name parts out of a single string.

* *

The code works by basically applying several Regexes in a certain order * and removing (chopping) tokens off the original string. The parser consumes * the tokens during its creation.

* *

This class is not thread-safe.

* * @since 0.1 */ public class HumanNameParserParser { private Name name; private String leadingInit; private String first; private String nicknames; private String middle; private String last; private String suffix; private String salutation; private String postnominal; private List suffixes; private List salutations; private List prefixes; private List postnominals; /** * Creates a parser given a string name. * * @param name string name */ public HumanNameParserParser(String name) { this(new Name(name)); } /** * Creates a parser given a {@code Name} object. * * @param name {@code Name} */ public HumanNameParserParser(Name name) { this.name = name; this.leadingInit = ""; this.first = ""; this.nicknames = ""; this.middle = ""; this.last = ""; this.suffix = ""; this.salutation = ""; this.postnominal = ""; this.salutations = Arrays.asList(new String[] {"mr", "master", "mister", "mrs", "miss", "ms", "dr", "prof", "rev", "fr", "judge", "honorable", "hon" }); this.suffixes = Arrays.asList(new String[] { "jr", "sr", "2", "ii", "iii", "iv", "v", "senior", "junior" }); this.postnominals = Arrays.asList(new String[] { "phd", "ph.d.", "ph.d", "esq", "esquire", "apr", "rph", "pe", "md", "ma", "dmd", "cme", "dds", "cpa", "dvm" }); this.prefixes = Arrays.asList(new String[] { "bar", "ben", "bin", "da", "dal", "de la", "de", "del", "der", "di", "ibn", "la", "le", "san", "st", "ste", "van", "van der", "van den", "vel", "von" }); this.parse(); } /** * Gets the {@code Name} object. 
* @return the {@code Name} object */ public Name getName() { return name; } public String getLeadingInit() { return leadingInit; } public String getFirst() { return first; } public String getNicknames() { return nicknames; } public String getMiddle() { return middle; } public String getLast() { return last; } public String getSuffix() { return suffix; } public String getPostnominal() { return postnominal; } public String getSalutation() { return salutation; } public List getSuffixes() { return suffixes; } public List getPostnominals() { return postnominals; } public List getSalutations() { return salutations; } public List getPrefixes() { return prefixes; } /** * Consumes the string and creates the name parts. * * @throws ParseException if the parser fails to retrieve the name parts */ private void parse() throws ParseException { String suffixes = StringUtils.join(this.suffixes, "\\.*|") + "\\.*"; String postnominals = StringUtils.join(this.postnominals, "\\.*|") + "\\.*"; String salutations = StringUtils.join(this.salutations, "\\.*|") + "\\.*"; String prefixes = StringUtils.join(this.prefixes, " |") + " "; // The regex use is a bit tricky. *Everything* matched by the regex will be replaced, // but you can select a particular parenthesized submatch to be returned. // Also, note that each regex requires that the preceding ones have been run, and matches chopped out. 
String nicknamesRegex = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) "; // names that starts or end w/ an apostrophe break this String suffixRegex = "(?i)[,| ]+(("+suffixes+")$)"; String postnominalRegex = "(?i)[,| ]+(("+postnominals+")$)"; String lastRegex = "(?i)(?!^)\\b([^ ]+ y |"+prefixes+")*[^ ]+$"; String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))"; // note the lookahead, which isn't returned or replaced String salutationsRegex = "(?i)^("+salutations+"\\b)(\\.|\\s)+"; //salutation plus a word boundary \b String firstRegex = "(?i)^([^ ]+)"; // get nickname, if there is one this.nicknames = this.name.chopWithRegex(nicknamesRegex, 2); // get postnominal, if there is one this.postnominal = this.name.chopWithRegex(postnominalRegex, 1); // get suffix, if there is one this.suffix = this.name.chopWithRegex(suffixRegex, 1); // flip the before-comma and after-comma parts of the name this.name.flip(","); // get the last name this.last = this.name.chopWithRegex(lastRegex, 0); if (StringUtils.isBlank(this.last)) { throw new ParseException("Couldn't find a last name in '{" + this.name.getStr() + "}'."); } // get salutation, if there is one this.salutation = this.name.chopWithRegex(salutationsRegex, 1); // get the first initial, if there is one this.leadingInit = this.name.chopWithRegex(leadingInitRegex, 1); // get the first name this.first = this.name.chopWithRegex(firstRegex, 0); if (StringUtils.isBlank(this.first)) { throw new ParseException("Couldn't find a first name in '{" + this.name.getStr() + "}'"); } // if anything's left, that's the middle name this.middle = this.name.getStr(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy