com.worksap.nlp.sudachi.UTF8InputText Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sudachi Show documentation
Sudachi Japanese Morphological Analyzer
There is a newer version: 0.7.4
/*
 * Copyright (c) 2020 Works Applications Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.worksap.nlp.sudachi;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;

import com.worksap.nlp.sudachi.dictionary.CategoryType;
import com.worksap.nlp.sudachi.dictionary.Grammar;

class UTF8InputText implements InputText {

    private final String originalText;
    private final String modifiedText;
    private final byte[] bytes;
    private final int[] byteToOriginal;
    private final int[] byteToModified;
    private final List modifiedToOriginal;
    private final List> charCategories;
    private final List charCategoryContinuities;
    private final List canBowList;

    UTF8InputText(Grammar grammar, String originalText, String modifiedText, byte[] bytes, int[] byteToOriginal,
            int[] byteToModified, List modifiedToOriginal, List> charCategories,
            List charCategoryContinuities, List canBowList) {

        this.originalText = originalText;
        this.modifiedText = modifiedText;
        this.bytes = bytes;
        this.byteToOriginal = byteToOriginal;
        this.byteToModified = byteToModified;
        this.modifiedToOriginal = modifiedToOriginal;
        this.charCategories = charCategories;
        this.charCategoryContinuities = charCategoryContinuities;
        this.canBowList = canBowList;
    }

    @Override
    public String getOriginalText() {
        return originalText;
    }

    @Override
    public String getText() {
        return modifiedText;
    }

    byte[] getByteText() {
        return bytes;
    }

    @Override
    public String getSubstring(int begin, int end) {
        if (begin < 0) {
            throw new StringIndexOutOfBoundsException(begin);
        }
        if (end > bytes.length) {
            throw new StringIndexOutOfBoundsException(end);
        }
        if (begin > end) {
            throw new StringIndexOutOfBoundsException(end - begin);
        }

        return modifiedText.substring(byteToModified[begin], byteToModified[end]);
    }

    @Override
    public UTF8InputText slice(int begin, int end) {
        if (begin < 0) {
            throw new StringIndexOutOfBoundsException(begin);
        }
        if (end > modifiedText.length()) {
            throw new StringIndexOutOfBoundsException(end);
        }
        if (begin > end) {
            throw new StringIndexOutOfBoundsException(end - begin);
        }

        int byteBegin = getCodePointsOffsetLength(0, begin);
        int length = getCodePointsOffsetLength(byteBegin, end - begin);
        int byteEnd = byteBegin + length;

        String originalText = this.originalText.substring(byteToOriginal[byteBegin], byteToOriginal[byteEnd]);
        String modifiedText = this.modifiedText.substring(begin, end);
        byte[] bytes = Arrays.copyOfRange(this.bytes, byteBegin, byteEnd);

        int[] byteToOriginal = new int[length + 1];
        for (int i = 0; i < length + 1; i++) {
            byteToOriginal[i] = this.byteToOriginal[byteBegin + i] - this.byteToOriginal[byteBegin];
        }
        int[] byteToModified = new int[length + 1];
        for (int i = 0; i < length + 1; i++) {
            byteToModified[i] = this.byteToModified[byteBegin + i] - begin;
        }
        List modifiedToOriginal = new ArrayList<>();
        for (int i = 0; i < end + 1; i++) {
            modifiedToOriginal.add(this.modifiedToOriginal.get(i) - this.modifiedToOriginal.get(begin));
        }

        List> charCategories = this.charCategories.subList(begin, end);

        List charCategoryContinuities = this.charCategoryContinuities.subList(byteBegin, byteEnd);
        if (charCategoryContinuities.get(length - 1) != 1) {
            int i = length - 1;
            int len = 1;
            while (i >= 0 && charCategoryContinuities.get(i) != 1) {
                charCategoryContinuities.set(i--, len++);
            }
        }

        List canBowList = this.canBowList.subList(begin, end);

        return new UTF8InputText(null, originalText, modifiedText, bytes, byteToOriginal, byteToModified,
                modifiedToOriginal, charCategories, charCategoryContinuities, canBowList);
    }

    int getOffsetTextLength(int index) {
        return byteToModified[index];
    }

    @Override
    public int getOriginalIndex(int index) {
        return byteToOriginal[index];
    }

    @Override
    public Set getCharCategoryTypes(int index) {
        return charCategories.get(byteToModified[index]);
    }

    @Override
    public Set getCharCategoryTypes(int begin, int end) {
        if (begin + getCharCategoryContinuousLength(begin) < end) {
            return Collections.emptySet();
        }
        int b = byteToModified[begin];
        int e = byteToModified[end];
        Set continuousCategory = charCategories.get(b).clone();
        for (int i = b + 1; i < e; i++) {
            continuousCategory.retainAll(charCategories.get(i));
        }
        return continuousCategory;
    }

    @Override
    public int getCharCategoryContinuousLength(int index) {
        return charCategoryContinuities.get(index);
    }

    @Override
    public int getCodePointsOffsetLength(int index, int codePointOffset) {
        int length = 0;
        int target = byteToModified[index] + codePointOffset;
        for (int i = index; i < bytes.length; i++) {
            if (byteToModified[i] >= target) {
                return length;
            }
            length++;
        }
        return length;
    }

    @Override
    public int codePointCount(int begin, int end) {
        return byteToModified[end] - byteToModified[begin];
    }

    @Override
    public boolean canBow(int index) {
        return isCharAlignment(index) && canBowList.get(byteToModified[index]);
    }

    @Override
    public int getWordCandidateLength(int index) {
        for (int i = index + 1; i < bytes.length; i++) {
            if (canBow(i)) {
                return i - index;
            }
        }
        return bytes.length - index;
    }

    private boolean isCharAlignment(int index) {
        return (bytes[index] & 0xC0) != 0x80;
    }

    @Override
    public int getNextInOriginal(int index) {
        int o = modifiedToOriginal.get(index + 1);
        while (index + 1 < modifiedText.length() + 1 && modifiedToOriginal.get(index + 1) == o) {
            index++;
        }
        return index;
    }
}