All downloads are free. Search and download functionality uses the official Maven repository.

org.codelibs.elasticsearch.index.analysis.CharMatcher Maven / Gradle / Ivy

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.codelibs.elasticsearch.index.analysis;

import java.util.HashSet;
import java.util.Set;

/**
 * A predicate over Unicode code points, used to decide whether a character
 * counts as a token character.
 *
 * <p>Ready-made matchers are available through {@link Basic} and
 * {@link ByUnicodeCategory}; several matchers can be OR-ed together with
 * {@link Builder}.
 */
public interface CharMatcher {

    /**
     * Matches code points whose {@link Character#getType(int)} value equals
     * one specific Unicode general category.
     */
    class ByUnicodeCategory implements CharMatcher {

        /**
         * Creates a matcher for a single Unicode general category.
         *
         * @param unicodeCategory a category constant as returned by
         *        {@link Character#getType(int)}, e.g. {@link Character#UPPERCASE_LETTER}
         * @return a matcher accepting exactly the code points of that category
         */
        public static CharMatcher of(byte unicodeCategory) {
            return new ByUnicodeCategory(unicodeCategory);
        }

        private final byte unicodeType;

        ByUnicodeCategory(byte unicodeType) {
            this.unicodeType = unicodeType;
        }

        @Override
        public boolean isTokenChar(int c) {
            return Character.getType(c) == unicodeType;
        }
    }

    /** Matchers for broad character classes, built on {@link Character} classification methods. */
    public enum Basic implements CharMatcher {
        /** Matches code points for which {@link Character#isLetter(int)} is true. */
        LETTER {
            @Override
            public boolean isTokenChar(int c) {
                return Character.isLetter(c);
            }
        },
        /** Matches code points for which {@link Character#isDigit(int)} is true. */
        DIGIT {
            @Override
            public boolean isTokenChar(int c) {
                return Character.isDigit(c);
            }
        },
        /** Matches code points for which {@link Character#isWhitespace(int)} is true. */
        WHITESPACE {
            @Override
            public boolean isTokenChar(int c) {
                return Character.isWhitespace(c);
            }
        },
        /** Matches code points in any of the seven punctuation categories. */
        PUNCTUATION {
            @Override
            public boolean isTokenChar(int c) {
                switch (Character.getType(c)) {
                case Character.START_PUNCTUATION:
                case Character.END_PUNCTUATION:
                case Character.OTHER_PUNCTUATION:
                case Character.CONNECTOR_PUNCTUATION:
                case Character.DASH_PUNCTUATION:
                case Character.INITIAL_QUOTE_PUNCTUATION:
                case Character.FINAL_QUOTE_PUNCTUATION:
                    return true;
                default:
                    return false;
                }
            }
        },
        /** Matches code points in the currency, math, modifier or other symbol categories. */
        SYMBOL {
            @Override
            public boolean isTokenChar(int c) {
                switch (Character.getType(c)) {
                case Character.CURRENCY_SYMBOL:
                case Character.MATH_SYMBOL:
                case Character.OTHER_SYMBOL:
                case Character.MODIFIER_SYMBOL:
                    return true;
                default:
                    return false;
                }
            }
        }
    }

    /**
     * Combines matchers into one that accepts a code point as soon as any of
     * the registered matchers accepts it.
     */
    public final class Builder {
        // Typed set: a raw Set would make both the iterator().next() result
        // and the for-each element type Object, which does not compile.
        private final Set<CharMatcher> matchers;

        Builder() {
            matchers = new HashSet<>();
        }

        /**
         * Adds an alternative to the matcher being built.
         *
         * @param matcher the matcher to OR into the result; no null check is performed here
         * @return this builder, for chaining
         */
        public Builder or(CharMatcher matcher) {
            matchers.add(matcher);
            return this;
        }

        /**
         * Builds the combined matcher.
         *
         * @return a matcher that rejects everything when no matcher was added,
         *         the sole registered matcher when exactly one was added,
         *         otherwise a matcher accepting a code point if any registered
         *         matcher accepts it
         */
        public CharMatcher build() {
            switch (matchers.size()) {
            case 0:
                return c -> false;
            case 1:
                return matchers.iterator().next();
            default: {
                // Snapshot the alternatives so that later or(...) calls on this
                // builder cannot alter a matcher that has already been handed out.
                final CharMatcher[] alternatives = matchers.toArray(new CharMatcher[0]);
                return c -> {
                    for (CharMatcher alternative : alternatives) {
                        if (alternative.isTokenChar(c)) {
                            return true;
                        }
                    }
                    return false;
                };
            }
            }
        }
    }

    /** Returns true if, and only if, the provided character matches this character class. */
    boolean isTokenChar(int c);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy