All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.optimaize.langdetect.text.TextObject Maven / Gradle / Ivy

The newest version!
package com.optimaize.langdetect.text;

import com.optimaize.langdetect.cybozu.util.CharNormalizer;
import com.google.common.annotations.Beta;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.io.Reader;

/**
 * A convenient text object implementing CharSequence and Appendable.
 *
 * This is an ideal object to use for learning text to create {@link com.optimaize.langdetect.profiles.LanguageProfile}s,
 * as well as to pass it in to {@link com.optimaize.langdetect.LanguageDetector#detect}.
 *
 * To get one, use a TextObjectFactory (through a TextObjectFactoryBuilder).
 *
 * Example use:
 * //create the factory once:
 * TextObjectFactory textObjectFactory = new TextObjectFactoryBuilder()
 *     .withTextFilter(UrlTextFilter.getInstance())
 *     .build();
 * //then create as many text objects as you like:
 * TextObject inputText = textObjectFactory.create().append("deutsche Text").append(" ").append("blah blah");
 *
 * All append() methods go through the {@code textFilter}.
 *
 * Equals/hashCode are not implemented as of now on purpose. You may want to call toString() and compare that.
 *
 * @author Fabian Kessler
 */
@Beta
public class TextObject implements CharSequence, Appendable {

    @NotNull
    private final TextFilter textFilter;

    @NotNull
    private final StringBuilder stringBuilder;

    private final int maxTextLength;


    /**
     * Creates an empty text object.
     *
     * @param textFilter    the filter kept for use by this object's append() methods
     * @param maxTextLength upper bound on the accumulated text length; 0 for no limit
     */
    public TextObject(@NotNull TextFilter textFilter, int maxTextLength) {
        // The three assignments are independent of each other; the buffer starts out empty.
        this.stringBuilder = new StringBuilder();
        this.maxTextLength = maxTextLength;
        this.textFilter = textFilter;
    }


    /**
     * Append the target text for language detection.
     * This method reads the text from the specified input reader.
     * If the total size of the target text exceeds the size limit,
     * the rest is ignored.
     *
     * @param reader the input reader (BufferedReader as usual)
     * @throws java.io.IOException Can't read the reader.
     */
    public TextObject append(Reader reader) throws IOException {
        char[] buf = new char[1024];
        while (reader.ready() && (maxTextLength==0 || stringBuilder.length()0 && stringBuilder.length()>=maxTextLength) return this;

        text = textFilter.filter(text);

        //unfortunately this code can't be put into a TextFilter because:
        //1) the limit could not be detected early, a lot of work would be done to waste time and memory
        //2) the last character of the existing string builder could not be seen. if it is a space, we don't want
        //   to add yet another space.
        char pre = stringBuilder.length()==0 ? 0 : stringBuilder.charAt(stringBuilder.length()-1);
        for (int i=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy