// org.carrot2.core.Document (Maven / Gradle / Ivy)
//
// Go to download

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.core;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.carrot2.util.MapUtils;
import org.carrot2.util.simplexml.SimpleXmlWrapperValue;
import org.carrot2.util.simplexml.SimpleXmlWrappers;
import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.Element;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.ElementMap;
import org.simpleframework.xml.Root;

import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.carrot2.shaded.guava.common.base.Function;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;
import org.carrot2.shaded.guava.common.collect.Ordering;
import org.carrot2.shaded.guava.common.collect.Sets;

/**
 * A document to be processed by the framework. Each document is a collection of
 * fields carrying different bits of information, e.g. {@link #TITLE} or
 * {@link #CONTENT_URL}.
 * <p>
 * Access to the fields map and to the list of serialization listeners is synchronized
 * on the document instance, because the same instance may be operated on by concurrent
 * threads (e.g. when a caching controller is used).
 */
@Root(name = "document")
@JsonAutoDetect(
    creatorVisibility  = JsonAutoDetect.Visibility.NONE,
    fieldVisibility    = JsonAutoDetect.Visibility.NONE,
    getterVisibility   = JsonAutoDetect.Visibility.NONE,
    isGetterVisibility = JsonAutoDetect.Visibility.NONE,
    setterVisibility   = JsonAutoDetect.Visibility.NONE)
@JsonSerialize()
@JsonInclude(JsonInclude.Include.NON_NULL)
public final class Document implements Cloneable
{
    /** Field name for the title of the document. */
    public static final String TITLE = "title";

    /**
     * Field name for a short summary of the document, e.g. the snippet returned by the
     * search engine.
     */
    public static final String SUMMARY = "snippet";

    /** Field name for an URL pointing to the full version of the document. */
    public static final String CONTENT_URL = "url";

    /**
     * Click URL. The URL that should be placed in the anchor to the document instead of
     * the value returned in {@link #CONTENT_URL}.
     */
    public static final String CLICK_URL = "click-url";

    /**
     * Field name for an URL pointing to the thumbnail image associated with the document.
     */
    public static final String THUMBNAIL_URL = "thumbnail-url";

    /** Document size. */
    public static final String SIZE = "size";

    /**
     * Document score. The semantics of the score depends on the specific document source.
     * Some document sources may not provide document scores at all.
     */
    public static final String SCORE = "score";

    /**
     * Field name for a list of sources the document was found in. Value type:
     * {@code List<String>}.
     */
    public static final String SOURCES = "sources";

    /**
     * Field name for the language in which the document is written. Value type:
     * {@link LanguageCode}. If the language field is not defined or is
     * {@code null}, it means the language of the document is unknown or it is
     * outside of the list defined in {@link LanguageCode}.
     */
    public static final String LANGUAGE = "language";

    /**
     * Identifiers of reference clustering partitions this document belongs to. Currently,
     * this field is used only to calculate various clustering quality metrics. In the
     * future, clustering algorithms may be able to use values of this field to increase
     * the quality of clustering.
     * <p>
     * Value type: {@code Collection<Object>}. There is no constraint on the
     * actual type of the partition identifier in the collection. Identifiers are assumed
     * to correctly implement the {@link #equals(Object)} and {@link #hashCode()} methods.
     * </p>
     */
    public static final String PARTITIONS = "partitions";

    /** Fields of this document. Guarded by {@code this}. */
    private final Map<String, Object> fields = Maps.newHashMap();

    /** Read-only view of {@link #fields}, exposed in {@link #getFields()}. */
    private final Map<String, Object> fieldsView = Collections.unmodifiableMap(fields);

    /**
     * @see #getStringId()
     * @see ProcessingResult
     */
    @Attribute(required = false)
    String id;

    /**
     * Listeners to be notified before this document gets serialized. Created lazily on
     * the first call to {@link #addSerializationListener(IDocumentSerializationListener)};
     * guarded by {@code this}.
     */
    private ArrayList<IDocumentSerializationListener> serializationListeners;

    /**
     * Creates an empty document with no fields.
     */
    public Document()
    {
    }

    /**
     * Creates a document with the provided {@code title}.
     */
    public Document(String title)
    {
        this(title, null);
    }

    /**
     * Creates a document with the provided {@code title} and {@code summary}.
     */
    public Document(String title, String summary)
    {
        // The cast disambiguates between the (String, String, String) and the
        // (String, String, LanguageCode) constructors.
        this(title, summary, (String) null);
    }

    /**
     * Creates a document with the provided {@code title}, {@code summary} and
     * {@code language}.
     */
    public Document(String title, String summary, LanguageCode language)
    {
        this(title, summary, null, language);
    }

    /**
     * Creates a document with the provided {@code title}, {@code summary} and
     * {@code contentUrl}.
     */
    public Document(String title, String summary, String contentUrl)
    {
        this(title, summary, contentUrl, null);
    }

    /**
     * Creates a document with the provided {@code title}, {@code summary},
     * {@code contentUrl} and {@code language}. The title and summary fields are always
     * set (possibly to {@code null}); the content URL is set only when it is not blank
     * and the language only when it is not {@code null}.
     */
    public Document(String title, String summary, String contentUrl, LanguageCode language)
    {
        setField(TITLE, title);
        setField(SUMMARY, summary);

        if (StringUtils.isNotBlank(contentUrl))
        {
            setField(CONTENT_URL, contentUrl);
        }

        if (language != null)
        {
            setField(LANGUAGE, language);
        }
    }

    /**
     * Creates a document with the provided {@code title}, {@code summary},
     * {@code contentUrl}, {@code language} and {@code id}. IDs should be unique
     * for clustering. If all documents passed for clustering have {@code null} IDs then
     * IDs are automatically generated.
     */
    public Document(String title, String summary, String contentUrl, LanguageCode language, String id)
    {
        this(title, summary, contentUrl, language);
        this.id = id;
    }

    /**
     * Returns the identifier of this document parsed as an integer.
     *
     * @return the integer value of the identifier or {@code null} if this document has
     *         no identifier
     * @deprecated please use {@link #getStringId()} instead. Currently, this method
     *             attempts to parse the string identifier returned by
     *             {@link #getStringId()} into an integer.
     * @throws NumberFormatException if the identifier could not be converted to an integer
     *             number
     */
    @Deprecated
    public Integer getId()
    {
        if (id == null)
        {
            return null;
        }

        try
        {
            return Integer.parseInt(id);
        }
        catch (NumberFormatException e)
        {
            // NumberFormatException has no (message, cause) constructor, so the
            // original failure is attached explicitly to keep it in the stack trace.
            final NumberFormatException wrapped = new NumberFormatException(
                "Could not parse document identifier as an integer: " + id);
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Identifier of this document. The semantics of the identifier varies depending on
     * the {@link IDocumentSource} that produced the documents.
     * <p>
     * When processing documents produced by Carrot2-provided {@link IDocumentSource}, the
     * framework generates unique integer identifiers for all the documents. However, when
     * XML document sets are loaded using the
     * {@link ProcessingResult#deserialize(java.io.InputStream)} or
     * {@link ProcessingResult#deserialize(CharSequence)} methods, the original document
     * identifiers are preserved, which means they may be non-unique or not present at all.
     * </p>
     *
     * @return identifier of this document, possibly {@code null}
     */
    @JsonProperty("id")
    public String getStringId()
    {
        return id;
    }

    /**
     * Returns this document's {@link #TITLE} field.
     */
    @JsonProperty
    @Element(required = false)
    public String getTitle()
    {
        return getField(TITLE);
    }

    /**
     * Sets this document's {@link #TITLE} field.
     *
     * @param title title to set
     * @return this document for convenience
     */
    @Element(required = false)
    public Document setTitle(String title)
    {
        return setField(TITLE, title);
    }

    /**
     * Returns this document's {@link #SUMMARY} field.
     */
    @JsonProperty("snippet")
    @Element(name = "snippet", required = false)
    public String getSummary()
    {
        return getField(SUMMARY);
    }

    /**
     * Sets this document's {@link #SUMMARY} field.
     *
     * @param summary summary to set
     * @return this document for convenience
     */
    @Element(name = "snippet", required = false)
    public Document setSummary(String summary)
    {
        return setField(SUMMARY, summary);
    }

    /**
     * Returns this document's {@link #CONTENT_URL} field.
     */
    @JsonProperty("url")
    @Element(name = "url", required = false)
    public String getContentUrl()
    {
        return getField(CONTENT_URL);
    }

    /**
     * Sets this document's {@link #CONTENT_URL} field.
     *
     * @param contentUrl content URL to set
     * @return this document for convenience
     */
    @Element(name = "url", required = false)
    public Document setContentUrl(String contentUrl)
    {
        return setField(CONTENT_URL, contentUrl);
    }

    /**
     * Returns this document's {@link #SOURCES} field.
     */
    @JsonProperty
    @ElementList(entry = "source", required = false)
    public List<String> getSources()
    {
        return getField(SOURCES);
    }

    /**
     * Sets this document's {@link #SOURCES} field.
     *
     * @param sources the sources list to set
     * @return this document for convenience
     */
    @ElementList(entry = "source", required = false)
    public Document setSources(List<String> sources)
    {
        return setField(SOURCES, sources);
    }

    /**
     * Returns this document's {@link #LANGUAGE}.
     */
    public LanguageCode getLanguage()
    {
        return getField(LANGUAGE);
    }

    /**
     * Sets this document's {@link #LANGUAGE}.
     *
     * @param language the language to set
     * @return this document for convenience
     */
    public Document setLanguage(LanguageCode language)
    {
        return setField(LANGUAGE, language);
    }

    /**
     * Returns this document's {@link #SCORE}.
     *
     * @return this document's {@link #SCORE}.
     */
    @Attribute(name = "score", required = false)
    public Double getScore()
    {
        return getField(SCORE);
    }

    /**
     * Sets this document's {@link #SCORE}.
     *
     * @param score the {@link #SCORE} to set
     * @return this document for convenience.
     */
    @Attribute(name = "score", required = false)
    public Document setScore(Double score)
    {
        return setField(SCORE, score);
    }

    /**
     * For JSON and XML serialization only.
     *
     * @return ISO code of this document's language or {@code null} if the language is
     *         not set
     */
    @JsonProperty("language")
    @Attribute(required = false, name = "language")
    private String getLanguageIsoCode()
    {
        final LanguageCode language = getLanguage();
        return language != null ? language.getIsoCode() : null;
    }

    /**
     * For XML deserialization only. Resolves the provided string to a
     * {@link LanguageCode}, first by ISO code and then by enum constant name.
     *
     * @param languageIsoCode ISO code (or enum constant name) of the language,
     *            {@code null} clears the language
     */
    @Attribute(required = false, name = "language")
    private void setLanguageIsoCode(String languageIsoCode)
    {
        if (languageIsoCode != null)
        {
            final LanguageCode language = LanguageCode.forISOCode(languageIsoCode);
            if (language != null)
            {
                setLanguage(language);
            }
            else
            {
                // Try by enum name for backward-compatibility
                setLanguage(LanguageCode.valueOf(languageIsoCode));
            }
        }
        else
        {
            setLanguage(null);
        }
    }

    /**
     * For JSON and XML serialization only. Returns a copy of the fields that do not have
     * dedicated accessors in this class, after passing it through the registered
     * serialization listeners.
     *
     * @return copy of the custom fields or {@code null} if there are none
     */
    @JsonProperty("fields")
    private Map<String, Object> getOtherFields()
    {
        final Map<String, Object> otherFields;

        // If a caching controller is used, concurrent threads can operate on the same
        // instance of the Document class, so we need to synchronize here to avoid
        // ConcurrentModificationExceptions.
        synchronized (this)
        {
            otherFields = Maps.newHashMap(fields);
        }

        // These fields are serialized through their own accessors.
        otherFields.remove(TITLE);
        otherFields.remove(SUMMARY);
        otherFields.remove(CONTENT_URL);
        otherFields.remove(SOURCES);
        otherFields.remove(LANGUAGE);
        otherFields.remove(SCORE);
        fireSerializationListeners(otherFields);
        return otherFields.isEmpty() ? null : otherFields;
    }

    /**
     * For XML serialization only. Returns a copy of the fields that do not have
     * dedicated accessors in this class, wrapped using {@link SimpleXmlWrappers} and
     * passed through the registered serialization listeners.
     *
     * @return wrapped copy of the custom fields or {@code null} if there are none
     */
    @ElementMap(entry = "field", key = "key", attribute = true, inline = true, required = false)
    private HashMap<String, SimpleXmlWrapperValue> getOtherFieldsXml()
    {
        final HashMap<String, SimpleXmlWrapperValue> otherFieldsForSerialization;

        // Same reasoning as in getOtherFields(): take the snapshot under the lock.
        synchronized (this)
        {
            otherFieldsForSerialization = MapUtils.asHashMap(SimpleXmlWrappers
                .wrap(fields));
        }

        // These fields are serialized through their own accessors.
        otherFieldsForSerialization.remove(TITLE);
        otherFieldsForSerialization.remove(SUMMARY);
        otherFieldsForSerialization.remove(CONTENT_URL);
        otherFieldsForSerialization.remove(SOURCES);
        otherFieldsForSerialization.remove(LANGUAGE);
        otherFieldsForSerialization.remove(SCORE);
        fireSerializationListeners(otherFieldsForSerialization);
        return otherFieldsForSerialization.isEmpty() ? null : otherFieldsForSerialization;
    }

    /**
     * For XML deserialization only. Unwraps the provided values and adds them to this
     * document's fields.
     *
     * @param otherFieldsForSerialization wrapped custom fields, may be {@code null}
     */
    @ElementMap(entry = "field", key = "key", attribute = true, inline = true, required = false)
    private void setOtherFieldsXml(
        HashMap<String, SimpleXmlWrapperValue> otherFieldsForSerialization)
    {
        if (otherFieldsForSerialization != null)
        {
            // No need to synchronize here, the object is being deserialized,
            // so it can't yet be seen by other threads.
            fields.putAll(SimpleXmlWrappers.unwrap(otherFieldsForSerialization));
        }
    }

    /**
     * Returns all fields of this document. The returned map is unmodifiable.
     *
     * @return all fields of this document
     */
    public Map<String, Object> getFields()
    {
        return fieldsView;
    }

    /**
     * Returns value of the specified field of this document. If no field corresponds to
     * the provided name, {@code null} will be returned.
     * <p>
     * The value is cast to the type expected by the caller without any check, so
     * requesting a type different from the one stored results in a
     * {@link ClassCastException} at the call site.
     * </p>
     *
     * @param name of the field to be returned
     * @return value of the field or {@code null}
     */
    @SuppressWarnings("unchecked")
    public <T> T getField(String name)
    {
        synchronized (this)
        {
            return (T) fields.get(name);
        }
    }

    /**
     * Sets a field in this document.
     *
     * @param name of the field to set
     * @param value value of the field
     * @return this document for convenience
     */
    public Document setField(String name, Object value)
    {
        synchronized (this)
        {
            fields.put(name, value);
        }
        return this;
    }

    /**
     * Creates a <strong>shallow</strong> clone of itself. The identifier
     * and the fields map is copied but values inside fields are not cloned.
     * Serialization listeners are not copied to the clone.
     */
    @Override
    public Document clone()
    {
        final Document clone = new Document();

        // The fields map is guarded by this instance's monitor everywhere else, so the
        // copy must be taken under the same lock to avoid observing a concurrent
        // modification.
        synchronized (this)
        {
            clone.id = this.id;
            clone.fields.putAll(this.fields);
        }
        return clone;
    }

    /**
     * Assigns sequential identifiers to the provided {@code documents}. If any
     * document in the set has a non-empty identifier, no identifiers will be generated at
     * all; the existing identifiers are only validated.
     *
     * @param documents documents to assign identifiers to.
     * @throws IllegalArgumentException Thrown if the collection of documents already contains
     *              identifiers and they are not unique, or if {@code null} identifiers
     *              are mixed with non-{@code null} ones.
     */
    public static void assignDocumentIds(Collection<Document> documents)
    {
        // We may get concurrent calls referring to the same documents
        // in the same list, so we need to synchronize here.
        synchronized (documents)
        {
            // Make sure there are no identifiers. Or if they are present, they should be unique.
            boolean hadIds = false;
            for (Document document : documents)
            {
                if (document.id != null)
                {
                    hadIds = true;
                    break;
                }
            }

            if (hadIds)
            {
                final HashSet<String> ids = Sets.newHashSet();
                for (Document doc : documents)
                {
                    String id = doc.getStringId();
                    // Repeated nulls are tolerated here and reported collectively below.
                    if (!ids.add(id) && id != null)
                    {
                      throw new IllegalArgumentException(
                          "Identifiers must be unique, duplicated identifier: " + id + 
                          " [existing: " + ids.toString() + "]");
                    }
                }

                if (ids.contains(null))
                {
                    throw new IllegalArgumentException(
                        "Null identifiers cannot be mixed with existing non-null identifiers: " +
                        " [existing: " + ids.toString() + "]");
                }
            }
            else
            {
                // All nulls, assign ids.
                int id = 0;
                for (final Document document : documents)
                {
                    document.id = Integer.toString(id);
                    id++;
                }
            }
        }
    }

    /**
     * Transforms a {@link Document} to its identifier returned by
     * {@link Document#getId()}.
     *
     * @deprecated Please use #getStringId() directly or use your own {@link Function}
     *             implementation.
     */
    @Deprecated
    public static final class DocumentToId implements Function<Document, Integer>
    {
        public static final DocumentToId INSTANCE = new DocumentToId();

        private DocumentToId()
        {
        }

        @Override
        public Integer apply(Document document)
        {
            return document.getId();
        }
    }

    /**
     * Compares {@link Document}s by their identifiers {@link #getId()}, which effectively
     * gives the original order in which they were returned by the document source.
     * Documents without an identifier are ordered first.
     *
     * @deprecated semantics of the identifiers depends on the document source, please
     *             roll your own comparator that is aware of the actual id semantics.
     */
    @Deprecated
    public static final Comparator<Document> BY_ID_COMPARATOR = Ordering.<Integer> natural()
        .nullsFirst().onResultOf(DocumentToId.INSTANCE);

    /**
     * Adds a serialization listener to this document.
     *
     * @param listener the listener to add
     */
    public void addSerializationListener(IDocumentSerializationListener listener)
    {
        synchronized (this)
        {
            if (serializationListeners == null)
            {
                serializationListeners = Lists.newArrayList();
            }
            serializationListeners.add(listener);
        }
    }

    /**
     * Enables listening to events related to XML/JSON serialization of {@link Document}s.
     */
    public static interface IDocumentSerializationListener
    {
        /**
         * Called before a {@link Document} gets serialized to XML or JSON. Specific
         * implementations may want to modify some properties of the document before it
         * gets serialized
         *
         * @param document the documents being serialized. Note: changes to the document
         *            will not be undone after serialization completes.
         * @param otherFieldsForSerialization custom fields that are about to be
         *            serialized. Changes made on this map will not affect the contents of
         *            the document.
         */
        public void beforeSerialization(Document document,
            Map<String, ?> otherFieldsForSerialization);
    }

    /**
     * Notifies all registered serialization listeners, in registration order, that this
     * document is about to be serialized. Listeners are invoked while holding this
     * document's monitor.
     *
     * @param otherFieldsForSerialization copy of the custom fields about to be serialized
     */
    private void fireSerializationListeners(Map<String, ?> otherFieldsForSerialization)
    {
        synchronized (this)
        {
            if (serializationListeners != null)
            {
                for (IDocumentSerializationListener listener : serializationListeners)
                {
                    listener.beforeSerialization(this, otherFieldsForSerialization);
                }
            }
        }
    }
}