All downloads are free. Search and download functionality uses the official Maven repository.

com.atilika.kuromoji.compile.CharacterDefinitionsCompiler Maven / Gradle / Ivy

/**
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji.compile;

import com.atilika.kuromoji.io.IntegerArrayIO;
import com.atilika.kuromoji.io.StringArrayIO;

import java.io.*;
import java.util.*;

public class CharacterDefinitionsCompiler implements Compiler {

    private Map categoryDefinitions = new TreeMap<>();

    @SuppressWarnings("unchecked")
    private List> codepointCategories = new ArrayList<>(new TreeSet());

    private OutputStream output;

    public CharacterDefinitionsCompiler(OutputStream output) {
        this.output = output;

        for (int i = 0; i < 65536; i++) {
            codepointCategories.add(null);
        }
    }

    public void readCharacterDefinition(InputStream stream, String encoding) throws IOException {
        LineNumberReader reader = new LineNumberReader(
            new InputStreamReader(stream, encoding)
        );

        String line;

        while ((line = reader.readLine()) != null) {
            // Strip comments
            line = line.replaceAll("\\s*#.*", "");

            // Skip empty line or comment line
            if (line.isEmpty()) {
                continue;
            }

            if (isCategoryEntry(line)) {
                parseCategory(line);
            } else {
                parseMapping(line);
            }
        }
    }

    private void parseCategory(String line) {
        String[] values = line.split("\\s+");

        String classname = values[0];
        int invoke = Integer.parseInt(values[1]);
        int group = Integer.parseInt(values[2]);
        int length = Integer.parseInt(values[3]);

        assert !categoryDefinitions.containsKey(classname);

        categoryDefinitions.put(
            classname,
            new int[]{invoke, group, length}
        );
    }

    private void parseMapping(String line) {
        String[] values = line.split("\\s+");

        assert values.length >= 2;

        String codepointString = values[0];
        List categories = getCategories(values);

        if (codepointString.contains("..")) {
            String[] codepoints = codepointString.split("\\.\\.");

            int lowerCodepoint = Integer.decode(codepoints[0]);
            int upperCodepoint = Integer.decode(codepoints[1]);

            for (int i = lowerCodepoint; i <= upperCodepoint; i++) {
                addMapping(i, categories);
            }

        } else {
            int codepoint = Integer.decode(codepointString);

            addMapping(codepoint, categories);
        }
    }

    private List getCategories(String[] values) {
        return Arrays.asList(values).subList(1, values.length);
    }

    private void addMapping(int codepoint, List categories) {
        for (String category : categories) {
            addMapping(codepoint, category);
        }
    }

    private void addMapping(int codepoint, String category) {
        Set categories = codepointCategories.get(codepoint);

        if (categories == null) {
            categories = new TreeSet<>();
            codepointCategories.set(codepoint, categories);
        }

        categories.add(category);
    }

    private boolean isCategoryEntry(String line) {
        return !line.startsWith("0x");
    }

    public Map makeCharacterCategoryMap() {
        Map classMapping = new TreeMap<>();
        int i = 0;

        for (String category : categoryDefinitions.keySet()) {
            classMapping.put(category, i++);
        }
        return classMapping;
    }

    private int[][] makeCharacterDefinitions() {
        Map categoryMap = makeCharacterCategoryMap();
        int size = categoryMap.size();
        int[][] array = new int[size][];

        for (String category : categoryDefinitions.keySet()) {
            int[] values = categoryDefinitions.get(category);

            assert values.length == 3;

            int index = categoryMap.get(category);
            array[index] = values;
        }

        return array;
    }

    private int[][] makeCharacterMappings() {
        Map categoryMap = makeCharacterCategoryMap();

        int size = codepointCategories.size();
        int[][] array = new int[size][];

        for (int i = 0; i < size; i++) {
            Set categories = codepointCategories.get(i);

            if (categories != null) {
                int innerSize = categories.size();
                int[] inner = new int[innerSize];

                int j = 0;

                for (String value : categories) {
                    inner[j++] = categoryMap.get(value);
                }
                array[i] = inner;
            }
        }

        return array;
    }

    private String[] makeCharacterCategorySymbols() {
        Map categoryMap = makeCharacterCategoryMap();
        Map inverted = new TreeMap<>();

        for (String key : categoryMap.keySet()) {
            inverted.put(categoryMap.get(key), key);
        }

        String[] categories = new String[inverted.size()];

        for (Integer index : inverted.keySet()) {
            categories[index] = inverted.get(index);
        }

        return categories;
    }

    public Map getCategoryDefinitions() {
        return categoryDefinitions;
    }

    public List> getCodepointCategories() {
        return codepointCategories;
    }

    @Override
    public void compile() throws IOException {
        IntegerArrayIO.writeSparseArray2D(output, makeCharacterDefinitions());
        IntegerArrayIO.writeSparseArray2D(output, makeCharacterMappings());
        StringArrayIO.writeArray(output, makeCharacterCategorySymbols());
        output.close();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy