All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.i18n.phonenumbers.PhoneNumberMatcher Maven / Gradle / Ivy

There is a newer version: 8.13.52
Show newest version
/*
 * Copyright (C) 2011 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.i18n.phonenumbers;

import com.google.i18n.phonenumbers.PhoneNumberUtil.Leniency;
import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber;

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A stateful class that finds and extracts telephone numbers from {@linkplain CharSequence text}.
 * Instances can be created using the {@linkplain PhoneNumberUtil#findNumbers factory methods} in
 * {@link PhoneNumberUtil}.
 *
 * 

Vanity numbers (phone numbers using alphabetic digits such as 1-800-SIX-FLAGS are * not found. * *

This class is not thread-safe. * * @author Tom Hofmann */ final class PhoneNumberMatcher implements Iterator { /** * The phone number pattern used by {@link #find}, similar to * {@code PhoneNumberUtil.VALID_PHONE_NUMBER}, but with the following differences: *

    *
  • All captures are limited in order to place an upper bound to the text matched by the * pattern. *
      *
    • Leading punctuation / plus signs are limited. *
    • Consecutive occurrences of punctuation are limited. *
    • Number of digits is limited. *
    *
  • No whitespace is allowed at the start or end. *
  • No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently supported. *
*/ private static final Pattern PATTERN; /** * A phone number pattern that does not allow whitespace as punctuation. This pattern is only used * in a second attempt to find a phone number occurring in the context of other numbers, such as * when the preceding or following token is a zip code. */ private static final Pattern INNER; /** * Matches strings that look like publication pages. Example: *
Computing Complete Answers to Queries in the Presence of Limited Access Patterns.
   * Chen Li. VLDB J. 12(3): 211-227 (2003).
* * The string "211-227 (2003)" is not a telephone number. */ private static final Pattern PUB_PAGES = Pattern.compile("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"); static { /* Builds the PATTERN and INNER regular expression patterns. The building blocks below * exist to make the patterns more easily understood. */ /* Limit on the number of leading (plus) characters. */ String leadLimit = limit(0, 2); /* Limit on the number of consecutive punctuation characters. */ String punctuationLimit = limit(0, 4); /* The maximum number of digits allowed in a digit-separated block. As we allow all digits in a * single block, set high enough to accommodate the entire national number and the international * country code. */ int digitBlockLimit = PhoneNumberUtil.MAX_LENGTH_FOR_NSN + PhoneNumberUtil.MAX_LENGTH_COUNTRY_CODE; /* Limit on the number of blocks separated by punctuation. Use digitBlockLimit since in some * formats use spaces to separate each digit. */ String blockLimit = limit(0, digitBlockLimit); /* Same as {@link PhoneNumberUtil#VALID_PUNCTUATION} but without space characters. */ String nonSpacePunctuationChars = removeSpace(PhoneNumberUtil.VALID_PUNCTUATION); /* A punctuation sequence without white space. */ String nonSpacePunctuation = "[" + nonSpacePunctuationChars + "]" + punctuationLimit; /* A punctuation sequence allowing white space. */ String punctuation = "[" + PhoneNumberUtil.VALID_PUNCTUATION + "]" + punctuationLimit; /* A digits block without punctuation. */ String digitSequence = "\\p{Nd}" + limit(1, digitBlockLimit); /* Punctuation that may be at the start of a phone number - brackets and plus signs. */ String leadClass = "[(\\[" + PhoneNumberUtil.PLUS_CHARS + "]"; /* Phone number pattern allowing optional punctuation. */ PATTERN = Pattern.compile( "(?:" + leadClass + punctuation + ")" + leadLimit + digitSequence + "(?:" + punctuation + digitSequence + ")" + blockLimit + "(?:" + PhoneNumberUtil.KNOWN_EXTN_PATTERNS + ")?", PhoneNumberUtil.REGEX_FLAGS); /* Phone number pattern with no whitespace allowed. */ INNER = Pattern.compile( leadClass + leadLimit + digitSequence + "(?:" + nonSpacePunctuation + digitSequence + ")" + blockLimit, PhoneNumberUtil.REGEX_FLAGS); } /** Returns a regular expression quantifier with an upper and lower limit. */ private static String limit(int lower, int upper) { if ((lower < 0) || (upper <= 0) || (upper < lower)) { throw new IllegalArgumentException(); } return "{" + lower + "," + upper + "}"; } /** * Returns a copy of {@code characters} with any {@linkplain Character#isSpaceChar space} * characters removed. */ private static String removeSpace(String characters) { StringBuilder builder = new StringBuilder(characters.length()); int i = 0; while (i < characters.length()) { int codePoint = characters.codePointAt(i); if (!Character.isSpaceChar(codePoint)) { builder.appendCodePoint(codePoint); } i += Character.charCount(codePoint); } return builder.toString(); } /** The potential states of a PhoneNumberMatcher. */ private enum State { NOT_READY, READY, DONE } /** The phone number utility. */ private final PhoneNumberUtil util; /** The text searched for phone numbers. */ private final CharSequence text; /** * The region (country) to assume for phone numbers without an international prefix, possibly * null. */ private final String preferredRegion; /** The degree of validation requested. */ private final Leniency leniency; /** The maximum number of retries after matching an invalid number. */ private long maxTries; /** The iteration tristate. */ private State state = State.NOT_READY; /** The last successful match, null unless in {@link State#READY}. */ private PhoneNumberMatch lastMatch = null; /** The next index to start searching at. Undefined in {@link State#DONE}. */ private int searchIndex = 0; /** * Creates a new instance. See the factory methods in {@link PhoneNumberUtil} on how to obtain a * new instance. * * @param util the phone number util to use * @param text the character sequence that we will search, null for no text * @param country the ISO 3166-1 two-letter country code indicating the country to assume for * phone numbers not written in international format (with a leading plus, or * with the international dialing prefix of the specified region). May be null or * "ZZ" if only numbers with a leading plus should be considered. * @param leniency the leniency to use when evaluating candidate phone numbers * @param maxTries the maximum number of invalid numbers to try before giving up on the text. * This is to cover degenerate cases where the text has a lot of false positives * in it. Must be {@code >= 0}. */ PhoneNumberMatcher(PhoneNumberUtil util, CharSequence text, String country, Leniency leniency, long maxTries) { if ((util == null) || (leniency == null)) { throw new NullPointerException(); } if (maxTries < 0) { throw new IllegalArgumentException(); } this.util = util; this.text = (text != null) ? text : ""; this.preferredRegion = country; this.leniency = leniency; this.maxTries = maxTries; } public boolean hasNext() { if (state == State.NOT_READY) { lastMatch = find(searchIndex); if (lastMatch == null) { state = State.DONE; } else { searchIndex = lastMatch.end(); state = State.READY; } } return state == State.READY; } public PhoneNumberMatch next() { // Check the state and find the next match as a side-effect if necessary. if (!hasNext()) { throw new NoSuchElementException(); } // Don't retain that memory any longer than necessary. PhoneNumberMatch result = lastMatch; lastMatch = null; state = State.NOT_READY; return result; } /** * Attempts to find the next subsequence in the searched sequence on or after {@code searchIndex} * that represents a phone number. Returns the next match, null if none was found. * * @param index the search index to start searching at * @return the phone number match found, null if none can be found */ private PhoneNumberMatch find(int index) { Matcher matcher = PATTERN.matcher(text); while ((maxTries > 0) && matcher.find(index)) { int start = matcher.start(); CharSequence candidate = text.subSequence(start, matcher.end()); // Check for extra numbers at the end. // TODO: This is the place to start when trying to support extraction of multiple phone number // from split notations (+41 79 123 45 67 / 68). candidate = trimAfterFirstMatch(PhoneNumberUtil.SECOND_NUMBER_START_PATTERN, candidate); PhoneNumberMatch match = extractMatch(candidate, start); if (match != null) { return match; } index = start + candidate.length(); maxTries--; } return null; } /** * Trims away any characters after the first match of {@code pattern} in {@code candidate}, * returning the trimmed version. */ private static CharSequence trimAfterFirstMatch(Pattern pattern, CharSequence candidate) { Matcher trailingCharsMatcher = pattern.matcher(candidate); if (trailingCharsMatcher.find()) { candidate = candidate.subSequence(0, trailingCharsMatcher.start()); } return candidate; } /** * Attempts to extract a match from a {@code candidate} character sequence. * * @param candidate the candidate text that might contain a phone number * @param offset the offset of {@code candidate} within {@link #text} * @return the match found, null if none can be found */ private PhoneNumberMatch extractMatch(CharSequence candidate, int offset) { // Skip a match that is more likely a publication page reference. if (PUB_PAGES.matcher(candidate).find()) { return null; } // Try to come up with a valid match given the entire candidate. String rawString = candidate.toString(); PhoneNumberMatch match = parseAndVerify(rawString, offset); if (match != null) { return match; } // If that failed, try to find an inner match without white space. return extractInnerMatch(rawString, offset); } /** * Attempts to extract a match from {@code candidate} using the {@link #INNER} pattern. * * @param candidate the candidate text that might contain a phone number * @param offset the offset of {@code candidate} within {@link #text} * @return the match found, null if none can be found */ private PhoneNumberMatch extractInnerMatch(String candidate, int offset) { int index = 0; Matcher matcher = INNER.matcher(candidate); while ((maxTries > 0) && matcher.find(index)) { String innerCandidate = candidate.substring(matcher.start(), matcher.end()); PhoneNumberMatch match = parseAndVerify(innerCandidate, offset + matcher.start()); if (match != null) { return match; } maxTries--; index = matcher.end(); } return null; } /** * Parses a phone number from the {@code candidate} using {@link PhoneNumberUtil#parse} and * verifies it matches the requested {@link #leniency}. If parsing and verification succeed, a * corresponding {@link PhoneNumberMatch} is returned, otherwise this method returns null. * * @param candidate the candidate match * @param offset the offset of {@code candidate} within {@link #text} * @return the parsed and validated phone number match, or null */ private PhoneNumberMatch parseAndVerify(String candidate, int offset) { try { PhoneNumber number = util.parse(candidate, preferredRegion); if (leniency.verify(number, util)) { return new PhoneNumberMatch(offset, candidate, number); } } catch (NumberParseException e) { // ignore and continue } return null; } /** * Always throws {@link UnsupportedOperationException} as removal is not supported. */ public void remove() { throw new UnsupportedOperationException(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy