com.basistech.rosette.dm.Token Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of adm-model Show documentation
ADM data model.
There is a newer version: 3.0.3
/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.basistech.rosette.dm;

import com.google.common.base.Objects;
import com.google.common.collect.Lists;

import java.io.Serializable;
import java.util.List;
import java.util.Map;

/**
 * The token. The definition of a token can vary by language, but
 * generally a token corresponds to a word.
 *
 * <p>In addition to the offsets and extended properties handled by
 * {@link Attribute}, a token carries its own text, an optional list of
 * normalized forms, an optional list of morphological analyses, and an
 * optional identifier of the component that produced it. All fields are
 * final and assigned once in the constructor; instances are normally created
 * through {@link Token.Builder}.
 */
public class Token extends Attribute implements Serializable {
    private static final long serialVersionUID = 222L;
    // The text is stored on the token itself so that callers do not have to
    // look it up in the parent {@link AnnotatedText}.
    private final String text;
    private final List normalized;
    private final List analyses;
    private final String source;

    /**
     * Constructs a token. Intended to be called from {@link Builder#build()} and subclasses.
     *
     * @param startOffset the start offset in characters
     * @param endOffset the end offset in characters
     * @param text the text of the token
     * @param normalized the normalized forms; passed through {@code listOrNull} before being stored
     * @param source the component that performed the tokenization; may be null
     * @param analyses the morphological analyses; passed through {@code listOrNull} before being stored
     * @param extendedProperties the extended properties, handed to the superclass
     */
    protected Token(int startOffset,
                    int endOffset,
                    String text,
                    List normalized,
                    String source,
                    List analyses,
                    Map extendedProperties) {
        super(startOffset, endOffset, extendedProperties);
        this.text = text;
        this.normalized = listOrNull(normalized);
        this.source = source;
        this.analyses = listOrNull(analyses);
    }

    /**
     * Returns the text of the token.
     * Note that, in some languages, the text may not be a substring of
     * the character data stored in the {@link com.basistech.rosette.dm.AnnotatedText}.
     * For example, a Chinese token could start at the end of a line and continue
     * to the next line.  The raw text would include the newline character, but
     * the token would not.
     *
     * @return the text of the token
     */
    public String getText() {
        return text;
    }

    /**
     * Returns the normalized forms of the token.
     *
     * @return the list of normalized forms, as stored by the constructor
     */
    public List getNormalized() {
        return normalized;
    }

    /**
     * Returns the list of analyses. Note: the items of this list are of the smallest type needed. So, even if the text
     * is Arabic or Chinese, some of the items in this list may be {@link com.basistech.rosette.dm.MorphoAnalysis}, not
     * the corresponding subclass. Callers must use instanceof to check if a particular item is of the subclass.
     *
     * @return the list of analyses
     */
    public List getAnalyses() {
        return analyses;
    }

    /**
     * Returns the source of this token.
     * This identifies the component that performed the tokenization.
     *
     * @return the source of this token
     */
    public String getSource() {
        return source;
    }

    /**
     * Compares this token with another object. Two tokens are equal when they
     * have the same runtime class, the superclass considers them equal, and
     * their text, normalized forms, analyses and source are all equal
     * (null-safe comparison).
     *
     * @param o the object to compare with
     * @return true if {@code o} is an equal token
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        if (!super.equals(o)) {
            return false;
        }

        Token other = (Token) o;
        return Objects.equal(analyses, other.analyses)
                && Objects.equal(normalized, other.normalized)
                && Objects.equal(source, other.source)
                && Objects.equal(text, other.text);
    }

    /**
     * Computes a hash code consistent with {@link #equals(Object)}.
     * Null fields are skipped rather than dereferenced, so a token whose text
     * is null can be hashed; for tokens with non-null text the value is the
     * same as it has always been.
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        int result = super.hashCode();
        // equals() tolerates a null text, so hashCode() must not throw on one.
        if (text != null) {
            result = 31 * result + text.hashCode();
        }
        if (normalized != null) {
            result = 31 * result + normalized.hashCode();
        }
        if (analyses != null) {
            result = 31 * result + analyses.hashCode();
        }
        if (source != null) {
            result = 31 * result + source.hashCode();
        }
        return result;
    }

    /**
     * Builds the helper used to render this token as a string, listing the
     * text, normalized forms, analyses and source.
     *
     * @return the string helper
     */
    @Override
    protected Objects.ToStringHelper toStringHelper() {
        return Objects.toStringHelper(this)
                .add("text", text)
                .add("normalized", normalized)
                .add("analyses", analyses)
                .add("source", source);
    }

    /**
     * Builder for tokens.
     */
    public static class Builder extends Attribute.Builder {
        private String text;
        private List normalized;
        private List analyses;

        private String source;

        /**
         * Constructs a builder from the required properties.
         * The normalized-form and analysis lists start out empty and the source is null.
         *
         * @param startOffset the start offset in characters
         * @param endOffset the end offset in characters
         * @param text the text of the token
         */
        public Builder(int startOffset, int endOffset, String text) {
            super(startOffset, endOffset);
            this.text = text;
            this.analyses = Lists.newArrayList();
            this.source = null;
            normalized = Lists.newArrayList();
        }

        /**
         * Constructs a builder from the values of an existing token.
         * The text, normalized forms, analyses and source are all carried over;
         * the two lists are copied into fresh lists owned by this builder.
         *
         * @param toCopy existing token to copy
         * @adm.ignore
         */
        public Builder(Token toCopy) {
            super(toCopy);
            text = toCopy.text;
            // The source is part of the token's identity (see equals), so a copy must keep it.
            source = toCopy.source;
            normalized = Lists.newArrayList();
            analyses = Lists.newArrayList();
            addAllToList(normalized, toCopy.normalized);
            addAllToList(analyses, toCopy.getAnalyses());
        }

        /**
         * Specifies the text.
         *
         * @param text the text
         * @return this
         */
        public Builder text(String text) {
            this.text = text;
            return this;
        }

        /**
         * Adds a normalized form.
         *
         * @param normalized the normalized form
         * @return this
         */
        public Builder addNormalized(String normalized) {
            // NOTE(review): normalized(List) stores whatever nullOrList returns, which
            // presumably can be null; start a fresh list in that case - confirm against Attribute.Builder.
            if (this.normalized == null) {
                this.normalized = Lists.newArrayList();
            }
            this.normalized.add(normalized);
            return this;
        }

        /**
         * Sets the list of normalized forms, replacing any forms added so far.
         *
         * @param normalized the normalized token forms.
         * @return this.
         */
        public Builder normalized(List normalized) {
            this.normalized = nullOrList(normalized);
            return this;
        }

        /**
         * Specifies the source of this token.
         *
         * @param source the source
         * @return this
         */
        public Builder source(String source) {
            this.source = source;
            return this;
        }

        /**
         * Adds an analysis.
         *
         * @param analysis the analysis
         * @return this
         */
        public Builder addAnalysis(MorphoAnalysis analysis) {
            // NOTE(review): analyses(List) stores whatever nullOrList returns, which
            // presumably can be null; start a fresh list in that case - confirm against Attribute.Builder.
            if (this.analyses == null) {
                this.analyses = Lists.newArrayList();
            }
            this.analyses.add(analysis);
            return this;
        }

        /**
         * Sets the list of morphological analyses, replacing any analyses added so far.
         *
         * @param analyses the analyses.
         * @return this.
         */
        public Builder analyses(List analyses) {
            this.analyses = nullOrList(analyses);
            return this;
        }

        /**
         * Creates a new immutable Token object from the current state of the builder.
         *
         * @return the new token
         */
        public Token build() {
            return new Token(startOffset, endOffset, text, normalized, source, analyses, buildExtendedProperties());
        }

        /**
         * Returns this builder, typed as {@code Token.Builder}.
         *
         * @return this
         */
        @Override
        protected Builder getThis() {
            return this;
        }
    }
}