All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.atilika.kuromoji.TokenizerBase Maven / Gradle / Ivy

/*-*
 * Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.  A copy of the
 * License is distributed with this work in the LICENSE.md file.  You may
 * also obtain a copy of the License from
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.atilika.kuromoji;

import com.atilika.kuromoji.dict.*;
import com.atilika.kuromoji.trie.DoubleArrayTrie;
import com.atilika.kuromoji.util.ResourceResolver;
import com.atilika.kuromoji.viterbi.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;

/**
 * TokenizerBase main class
 */
public abstract class TokenizerBase {

    public enum Mode {
        NORMAL, SEARCH, EXTENDED
    }

    private ViterbiBuilder viterbiBuilder;

    private ViterbiSearcher viterbiSearcher;

    private ViterbiFormatter viterbiFormatter;

    private boolean split;

    private TokenInfoDictionary tokenInfoDictionary;

    private UnknownDictionary unknownDictionary;

    private UserDictionary userDictionary;

    private InsertedDictionary insertedDictionary;

    protected TokenFactory tokenFactory;

    protected EnumMap dictionaryMap = new EnumMap<>(ViterbiNode.Type.class);

    protected void configure(Builder builder) {

        builder.loadDictionaries();

        this.tokenFactory = builder.tokenFactory;

        this.tokenInfoDictionary = builder.tokenInfoDictionary;
        this.unknownDictionary = builder.unknownDictionary;
        this.userDictionary = builder.userDictionary;
        this.insertedDictionary = builder.insertedDictionary;

        this.viterbiBuilder = new ViterbiBuilder(builder.doubleArrayTrie, tokenInfoDictionary, unknownDictionary,
                        userDictionary, builder.mode);

        this.viterbiSearcher = new ViterbiSearcher(builder.mode, builder.connectionCosts, unknownDictionary,
                        builder.penalties);

        this.viterbiFormatter = new ViterbiFormatter(builder.connectionCosts);
        this.split = builder.split;

        initDictionaryMap();
    }

    private void initDictionaryMap() {
        dictionaryMap.put(ViterbiNode.Type.KNOWN, tokenInfoDictionary);
        dictionaryMap.put(ViterbiNode.Type.UNKNOWN, unknownDictionary);
        dictionaryMap.put(ViterbiNode.Type.USER, userDictionary);
        dictionaryMap.put(ViterbiNode.Type.INSERTED, insertedDictionary);
    }

    public List tokenize(String text) {
        return createTokenList(text);
    }


    /**
     * Tokenizes the provided text and returns a list of tokens with various feature information
     * 

* This method is thread safe * * @param text text to tokenize * @param token type * @return list of Token, not null */ protected List createTokenList(String text) { if (!split) { return createTokenList(0, text); } List splitPositions = getSplitPositions(text); if (splitPositions.isEmpty()) { return createTokenList(0, text); } ArrayList result = new ArrayList<>(); int offset = 0; for (int position : splitPositions) { result.addAll(this.createTokenList(offset, text.substring(offset, position + 1))); offset = position + 1; } if (offset < text.length()) { result.addAll(this.createTokenList(offset, text.substring(offset))); } return result; } /** * Tokenizes the provided text and outputs the corresponding Viterbi lattice and the Viterbi path to the provided output stream *

* The output is written in DOT format. *

* This method is not thread safe * * @param outputStream output stream to write to * @param text text to tokenize * @throws IOException if an error occurs when writing the lattice and path */ public void debugTokenize(OutputStream outputStream, String text) throws IOException { ViterbiLattice lattice = viterbiBuilder.build(text); List bestPath = viterbiSearcher.search(lattice); outputStream.write(viterbiFormatter.format(lattice, bestPath).getBytes(StandardCharsets.UTF_8)); outputStream.flush(); } /** * Writes the Viterbi lattice for the provided text to an output stream *

* The output is written in DOT format. *

* This method is not thread safe * * @param outputStream output stream to write to * @param text text to create lattice for * @throws IOException if an error occurs when writing the lattice */ public void debugLattice(OutputStream outputStream, String text) throws IOException { ViterbiLattice lattice = viterbiBuilder.build(text); outputStream.write(viterbiFormatter.format(lattice).getBytes(StandardCharsets.UTF_8)); outputStream.flush(); } /** * Split input text at 句読点, which is 。 and 、 * * @param text * @return list of split position */ private List getSplitPositions(String text) { ArrayList splitPositions = new ArrayList<>(); int position; int currentPosition = 0; while (true) { int indexOfMaru = text.indexOf("。", currentPosition); int indexOfTen = text.indexOf("、", currentPosition); if (indexOfMaru < 0 || indexOfTen < 0) { position = Math.max(indexOfMaru, indexOfTen); } else { position = Math.min(indexOfMaru, indexOfTen); } if (position >= 0) { splitPositions.add(position); currentPosition = position + 1; } else { break; } } return splitPositions; } /** * Tokenize input sentence. * * @param offset offset of sentence in original input text * @param text sentence to tokenize * @return list of Token */ private List createTokenList(int offset, String text) { ArrayList result = new ArrayList<>(); ViterbiLattice lattice = viterbiBuilder.build(text); List bestPath = viterbiSearcher.search(lattice); for (ViterbiNode node : bestPath) { int wordId = node.getWordId(); if (node.getType() == ViterbiNode.Type.KNOWN && wordId == -1) { // Do not include BOS/EOS continue; } @SuppressWarnings("unchecked") T token = (T) tokenFactory.createToken(wordId, node.getSurface(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); result.add(token); } return result; } /** * Abstract Builder shared by all tokenizers */ public abstract static class Builder { protected DoubleArrayTrie doubleArrayTrie; protected ConnectionCosts connectionCosts; protected TokenInfoDictionary tokenInfoDictionary; protected UnknownDictionary unknownDictionary; protected CharacterDefinitions characterDefinitions; protected InsertedDictionary insertedDictionary; protected UserDictionary userDictionary = null; protected Mode mode = Mode.NORMAL; protected boolean split = true; protected List penalties = Collections.emptyList(); protected int totalFeatures = -1; protected int readingFeature = -1; protected int partOfSpeechFeature = -1; protected ResourceResolver resolver; protected TokenFactory tokenFactory; protected void loadDictionaries() { try { doubleArrayTrie = DoubleArrayTrie.newInstance(resolver); connectionCosts = ConnectionCosts.newInstance(resolver); tokenInfoDictionary = TokenInfoDictionary.newInstance(resolver); characterDefinitions = CharacterDefinitions.newInstance(resolver); unknownDictionary = UnknownDictionary.newInstance(resolver, characterDefinitions, totalFeatures); insertedDictionary = new InsertedDictionary(totalFeatures); } catch (Exception ouch) { throw new RuntimeException("Could not load dictionaries.", ouch); } } /** * Creates a Tokenizer instance defined by this Builder * * @param token type * @return Tokenizer instance */ public abstract T build(); /** * Sets an optional user dictionary as an input stream *

* The inpuut stream provided is not closed by this method * * @param input user dictionary as an input stream * @return this builder * @throws IOException if an error occurs when reading the user dictionary */ public Builder userDictionary(InputStream input) throws IOException { this.userDictionary = new UserDictionary(input, totalFeatures, readingFeature, partOfSpeechFeature); return this; } /** * Sets an optional user dictionary filename * * @param filename user dictionary filename * @return this builder * @throws IOException if an error occurs when reading the user dictionary */ public Builder userDictionary(String filename) throws IOException { InputStream input = new BufferedInputStream(new FileInputStream(filename)); this.userDictionary(input); input.close(); return this; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy