All downloads are free. The search and download functionality uses the official Maven repository.

ai.djl.modality.nlp.NlpUtils Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
 * with the License. A copy of the License is located at
 *
 * http://aws.amazon.com/apache2.0/
 *
 * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
 * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */
package ai.djl.modality.nlp;

import java.util.regex.Pattern;

/** Utility functions for processing String and Characters in NLP problems. */
public final class NlpUtils {

    /**
     * Matches exactly one punctuation character: either an ASCII punctuation/symbol character
     * (POSIX {@code Punct} class) or any character with the Unicode {@code Punctuation} property.
     *
     * <p>Compiled once and shared, because {@link #isPunctuation(char)} is typically invoked for
     * every character of the input text. {@link Pattern} instances are immutable and thread-safe.
     */
    private static final Pattern PUNCTUATION =
            Pattern.compile("[\\p{Punct}\\p{IsPunctuation}]");

    private NlpUtils() {}

    /**
     * Checks whether a character is considered as a whitespace.
     *
     * <p>Tab, newline and unicode space characters (including non-breaking spaces) are all
     * considered as whitespace.
     *
     * @param c input character to be checked
     * @return whether the character is considered as a whitespace
     */
    public static boolean isWhiteSpace(char c) {
        // isWhitespace covers tab/newline/etc. but excludes non-breaking spaces;
        // isSpaceChar covers all Unicode space separators. Accept either.
        return Character.isWhitespace(c) || Character.isSpaceChar(c);
    }

    /**
     * Checks whether a character is considered as a control character.
     *
     * <p>ISO control characters are considered as control characters, with the exception of tab,
     * newline and carriage return: those are reported as whitespace by {@link
     * #isWhiteSpace(char)} and therefore this method returns {@code false} for them.
     *
     * @param c input character to be checked
     * @return whether the character is considered as a control character
     */
    public static boolean isControl(char c) {
        if (c == '\t' || c == '\n' || c == '\r') {
            // Technically ISO control characters, but treated as whitespace instead.
            return false;
        }
        return Character.isISOControl(c);
    }

    /**
     * Checks whether a character is considered as a punctuation.
     *
     * <p>All ASCII punctuation and symbol characters are treated as punctuation. Characters such
     * as "^", "$", and "`" are not in the Unicode Punctuation class but we treat them as
     * punctuation anyways, for consistency. Non-ASCII characters are punctuation when they carry
     * the Unicode Punctuation property.
     *
     * @param c input character to be checked
     * @return whether the character is considered as a punctuation
     */
    public static boolean isPunctuation(char c) {
        return PUNCTUATION.matcher(String.valueOf(c)).matches();
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy