All downloads are free. The search and download functionality uses the official Maven repository.

io.xlate.edi.internal.stream.tokenization.CharacterSet Maven / Gradle / Ivy

There is a newer version: 1.25.2
Show newest version
/*******************************************************************************
 * Copyright 2017 xlate.io LLC, http://www.xlate.io
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 ******************************************************************************/
package io.xlate.edi.internal.stream.tokenization;

import java.util.Arrays;
import java.util.Map;
import java.util.TreeMap;

/**
 * Maps character values to {@link CharacterClass} values.
 *
 * <p>Each instance starts from a shared 128-entry ASCII prototype table and
 * may be customized per character with {@link #setClass(int, CharacterClass)}
 * (for example to mark delimiters). Characters at or beyond the end of the
 * table are looked up in an auxiliary map and default to
 * {@code CharacterClass.OTHER} when no explicit class has been assigned.
 *
 * <p>Character values are used directly as array indexes when they are below
 * the table length, so a negative value results in an
 * {@link ArrayIndexOutOfBoundsException}.
 */
public class CharacterSet {

    private static final CharacterClass _SPACE = CharacterClass.SPACE;
    private static final CharacterClass _LATNA = CharacterClass.LATIN_A;
    private static final CharacterClass _LATNB = CharacterClass.LATIN_B;
    private static final CharacterClass _LATNE = CharacterClass.LATIN_E;
    private static final CharacterClass _LATNI = CharacterClass.LATIN_I;
    private static final CharacterClass _LATNN = CharacterClass.LATIN_N;
    private static final CharacterClass _LATNS = CharacterClass.LATIN_S;
    private static final CharacterClass _LATNU = CharacterClass.LATIN_U;
    private static final CharacterClass _LATNZ = CharacterClass.LATIN_Z;
    private static final CharacterClass _ALNUM = CharacterClass.ALPHANUMERIC;
    private static final CharacterClass _OTHER = CharacterClass.OTHER;
    private static final CharacterClass _WHITE = CharacterClass.WHITESPACE;
    private static final CharacterClass _CNTRL = CharacterClass.CONTROL;
    private static final CharacterClass _INVLD = CharacterClass.INVALID;

    /*
     * This array maps the 128 ASCII characters into character classes. The
     * remaining Unicode characters should be mapped to _OTHER. Control
     * characters are errors.
     *
     * The table is never modified; each instance works on its own copy.
     */
    private static final CharacterClass[] prototype = {
        _INVLD, /* 00 NUL */
        _CNTRL, /* 01 SOH */
        _CNTRL, /* 02 STX */
        _CNTRL, /* 03 ETX */
        _CNTRL, /* 04 EOT */
        _CNTRL, /* 05 ENQ */
        _CNTRL, /* 06 ACK */
        _CNTRL, /* 07 BEL */
        _INVLD, /* 08 BS */
        _WHITE, /* 09 HT */
        _WHITE, /* 0A LF */
        _WHITE, /* 0B VT */
        _WHITE, /* 0C FF */
        _WHITE, /* 0D CR */
        _INVLD, /* 0E SO */
        _INVLD, /* 0F SI */
        _INVLD, /* 10 DLE */
        _CNTRL, /* 11 DC1 */
        _CNTRL, /* 12 DC2 */
        _CNTRL, /* 13 DC3 */
        _CNTRL, /* 14 DC4 */
        _CNTRL, /* 15 NAK */
        _CNTRL, /* 16 SYN */
        _CNTRL, /* 17 ETB */
        _INVLD, /* 18 CAN */
        _INVLD, /* 19 EM */
        _INVLD, /* 1A SUB */
        _INVLD, /* 1B ESC */
        _CNTRL, /* 1C FS */
        _CNTRL, /* 1D GS */
        _CNTRL, /* 1E RS */
        _CNTRL, /* 1F US */
        _SPACE, /* 20 Space */
        _OTHER, /* 21 ! */
        _OTHER, /* 22 " */
        _OTHER, /* 23 # */
        _OTHER, /* 24 $ */
        _OTHER, /* 25 % */
        _OTHER, /* 26 & */
        _OTHER, /* 27 ' */
        _OTHER, /* 28 ( */
        _OTHER, /* 29 ) */
        _OTHER, /* 2A * */
        _OTHER, /* 2B + */
        _OTHER, /* 2C , */
        _OTHER, /* 2D - */
        _OTHER, /* 2E . */
        _OTHER, /* 2F / */
        _ALNUM, /* 30 0 */
        _ALNUM, /* 31 1 */
        _ALNUM, /* 32 2 */
        _ALNUM, /* 33 3 */
        _ALNUM, /* 34 4 */
        _ALNUM, /* 35 5 */
        _ALNUM, /* 36 6 */
        _ALNUM, /* 37 7 */
        _ALNUM, /* 38 8 */
        _ALNUM, /* 39 9 */
        _OTHER, /* 3A : */
        _OTHER, /* 3B ';' */
        _OTHER, /* 3C < */
        _OTHER, /* 3D = */
        _OTHER, /* 3E > */
        _OTHER, /* 3F ? */
        _OTHER, /* 40 @ */
        _LATNA, /* 41 A */
        _LATNB, /* 42 B */
        _ALNUM, /* 43 C */
        _ALNUM, /* 44 D */
        _LATNE, /* 45 E */
        _ALNUM, /* 46 F */
        _ALNUM, /* 47 G */
        _ALNUM, /* 48 H */
        _LATNI, /* 49 I */
        _ALNUM, /* 4A J */
        _ALNUM, /* 4B K */
        _ALNUM, /* 4C L */
        _ALNUM, /* 4D M */
        _LATNN, /* 4E N */
        _ALNUM, /* 4F O */
        _ALNUM, /* 50 P */
        _ALNUM, /* 51 Q */
        _ALNUM, /* 52 R */
        _LATNS, /* 53 S */
        _ALNUM, /* 54 T */
        _LATNU, /* 55 U */
        _ALNUM, /* 56 V */
        _ALNUM, /* 57 W */
        _ALNUM, /* 58 X */
        _ALNUM, /* 59 Y */
        _LATNZ, /* 5A Z */
        _OTHER, /* 5B [ */
        _OTHER, /* 5C \ */
        _OTHER, /* 5D ] */
        _OTHER, /* 5E ^ */
        _OTHER, /* 5F _ */
        _OTHER, /* 60 ` */
        _ALNUM, /* 61 a */
        _ALNUM, /* 62 b */
        _ALNUM, /* 63 c */
        _ALNUM, /* 64 d */
        _ALNUM, /* 65 e */
        _ALNUM, /* 66 f */
        _ALNUM, /* 67 g */
        _ALNUM, /* 68 h */
        _ALNUM, /* 69 i */
        _ALNUM, /* 6A j */
        _ALNUM, /* 6B k */
        _ALNUM, /* 6C l */
        _ALNUM, /* 6D m */
        _ALNUM, /* 6E n */
        _ALNUM, /* 6F o */
        _ALNUM, /* 70 p */
        _ALNUM, /* 71 q */
        _ALNUM, /* 72 r */
        _ALNUM, /* 73 s */
        _ALNUM, /* 74 t */
        _ALNUM, /* 75 u */
        _ALNUM, /* 76 v */
        _ALNUM, /* 77 w */
        _ALNUM, /* 78 x */
        _ALNUM, /* 79 y */
        _ALNUM, /* 7A z */
        _OTHER, /* 7B '{' */
        _OTHER, /* 7C | */
        _OTHER, /* 7D '}' */
        _OTHER, /* 7E ~ */
        _INVLD /* 7F DEL */
    };

    /** Per-instance, mutable copy of {@link #prototype}, indexed by character value. */
    private final CharacterClass[] list;

    /**
     * Explicit class assignments for characters at or beyond {@code list.length}.
     * Typed (rather than raw) so that lookups yield {@code CharacterClass}
     * without unchecked operations.
     */
    private final Map<Integer, CharacterClass> auxiliary;

    /** Value reported by {@link #isIgnored(int)} for control, invalid and whitespace characters. */
    private final boolean extraneousIgnored;

    /**
     * Creates a character set in which extraneous characters are not ignored.
     */
    public CharacterSet() {
        this(false);
    }

    /**
     * Creates a character set initialized from the ASCII prototype table.
     *
     * @param extraneousIgnored
     *            the value {@link #isIgnored(int)} returns for characters
     *            classified as CONTROL, INVALID or WHITESPACE
     */
    public CharacterSet(boolean extraneousIgnored) {
        this.list = Arrays.copyOf(prototype, prototype.length);
        this.auxiliary = new TreeMap<>();
        this.extraneousIgnored = extraneousIgnored;
    }

    /**
     * Returns the class currently assigned to the given character.
     *
     * @param character
     *            the character value to classify
     * @return the class from this instance's table when the value is below the
     *         table length; otherwise the class assigned via
     *         {@link #setClass(int, CharacterClass)}, or
     *         {@code CharacterClass.OTHER} when none was assigned
     * @throws ArrayIndexOutOfBoundsException
     *             if {@code character} is negative
     */
    public CharacterClass getClass(int character) {
        if (character < list.length) {
            return list[character];
        }

        return auxiliary.getOrDefault(character, _OTHER);
    }

    /**
     * Restores this instance to its initial state: the table is overwritten
     * with the prototype values and all auxiliary assignments are removed.
     */
    public void reset() {
        System.arraycopy(prototype, 0, list, 0, prototype.length);
        auxiliary.clear();
    }

    /**
     * Assigns a class to the given character for this instance only.
     *
     * @param character
     *            the character value to (re)classify
     * @param clazz
     *            the class to associate with the character
     * @throws ArrayIndexOutOfBoundsException
     *             if {@code character} is negative
     */
    public void setClass(int character, CharacterClass clazz) {
        if (character < list.length) {
            list[character] = clazz;
        } else {
            auxiliary.put(character, clazz);
        }
    }

    /**
     * Tests whether the given character is currently classified as one of the
     * delimiter classes.
     *
     * @param character
     *            the character value to test
     * @return {@code true} if the character's class is ELEMENT_DELIMITER,
     *         ELEMENT_REPEATER, SEGMENT_DELIMITER or COMPONENT_DELIMITER
     */
    public boolean isDelimiter(int character) {
        switch (getClass(character)) {
        case ELEMENT_DELIMITER:
        case ELEMENT_REPEATER:
        case SEGMENT_DELIMITER:
        case COMPONENT_DELIMITER:
            return true;
        default:
            return false;
        }
    }

    /**
     * Tests whether the given character is to be ignored.
     *
     * @param character
     *            the character value to test
     * @return the {@code extraneousIgnored} setting of this instance when the
     *         character's class is CONTROL, INVALID or WHITESPACE;
     *         {@code false} for every other class
     */
    public boolean isIgnored(int character) {
        switch (getClass(character)) {
        case CONTROL:
        case INVALID:
        case WHITESPACE:
            return extraneousIgnored;
        default:
            return false;
        }
    }

    /**
     * Tests whether the given character is currently assigned the given class.
     *
     * @param character
     *            the character value to test
     * @param clazz
     *            the class to compare against
     * @return {@code true} if the character's current class equals
     *         {@code clazz}
     */
    public boolean isCharacterClass(int character, CharacterClass clazz) {
        return getClass(character).equals(clazz);
    }

    /**
     * Tests a character against the unmodified prototype table, independent of
     * any per-instance customization.
     *
     * @param character
     *            the character value to test
     * @return {@code false} if the prototype classifies the character as
     *         CONTROL or INVALID; {@code true} otherwise, including for every
     *         value at or beyond the end of the prototype table
     * @throws ArrayIndexOutOfBoundsException
     *             if {@code character} is negative
     */
    public static boolean isValid(int character) {
        if (character >= prototype.length) {
            return true;
        }

        switch (prototype[character]) {
        case CONTROL:
        case INVALID:
            return false;
        default:
            return true;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy