All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.carrot2.core.Document Maven / Gradle / Ivy

Go to download

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

There is a newer version: 3.16.3
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.core;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.carrot2.util.MapUtils;
import org.carrot2.util.simplexml.SimpleXmlWrapperValue;
import org.carrot2.util.simplexml.SimpleXmlWrappers;
import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.Element;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.ElementMap;
import org.simpleframework.xml.Root;

import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.carrot2.shaded.guava.common.base.Function;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;
import org.carrot2.shaded.guava.common.collect.Ordering;
import org.carrot2.shaded.guava.common.collect.Sets;

/**
 * A document that to be processed by the framework. Each document is a collection of
 * fields carrying different bits of information, e.g. {@link #TITLE} or
 * {@link #CONTENT_URL}.
 */
@Root(name = "document")
@JsonAutoDetect(
    creatorVisibility  = JsonAutoDetect.Visibility.NONE,
    fieldVisibility    = JsonAutoDetect.Visibility.NONE,
    getterVisibility   = JsonAutoDetect.Visibility.NONE,
    isGetterVisibility = JsonAutoDetect.Visibility.NONE,
    setterVisibility   = JsonAutoDetect.Visibility.NONE)
@JsonSerialize()
@JsonInclude(JsonInclude.Include.NON_NULL)
public final class Document implements Cloneable
{
    /** Field name for the title of the document. */
    public static final String TITLE = "title";

    /**
     * Field name for a short summary of the document, e.g. the snippet returned by the
     * search engine.
     */
    public static final String SUMMARY = "snippet";

    /** Field name for an URL pointing to the full version of the document. */
    public static final String CONTENT_URL = "url";

    /**
     * Click URL. The URL that should be placed in the anchor to the document instead of
     * the value returned in {@link #CONTENT_URL}.
     */
    public static final String CLICK_URL = "click-url";

    /**
     * Field name for an URL pointing to the thumbnail image associated with the document.
     */
    public static final String THUMBNAIL_URL = "thumbnail-url";

    /** Document size. */
    public static final String SIZE = "size";

    /**
     * Document score. The semantics of the score depends on the specific document source.
     * Some document sources may not provide document scores at all.
     */
    public static final String SCORE = "score";

    /**
     * Field name for a list of sources the document was found in. Value type:
     * List<String>
     */
    public static final String SOURCES = "sources";

    /**
     * Field name for the language in which the document is written. Value type:
     * {@link LanguageCode}. If the language field is not defined or is
     * null, it means the language of the document is unknown or it is
     * outside of the list defined in {@link LanguageCode}.
     */
    public static final String LANGUAGE = "language";

    /**
     * Identifiers of reference clustering partitions this document belongs to. Currently,
     * this field is used only to calculate various clustering quality metrics. In the
     * future, clustering algorithms may be able to use values of this field to increase
     * the quality of clustering.
     * 

* Value type: Collection<Object>. There is no constraint on the * actual type of the partition identifier in the collection. Identifiers are assumed * to correctly implement the {@link #equals(Object)} and {@link #hashCode()} methods. *

*/ public static final String PARTITIONS = "partitions"; /** Fields of this document */ private final Map fields = Maps.newHashMap(); /** Read-only collection of fields exposed in {@link #getField(String)}. */ private final Map fieldsView = Collections.unmodifiableMap(fields); /** * @see #getStringId() * @see ProcessingResult */ @Attribute(required = false) String id; /** * Listeners to be notified before this document gets serialized. */ private ArrayList serializationListeners; /** * Creates an empty document with no fields. */ public Document() { } /** * Creates a document with the provided title. */ public Document(String title) { this(title, null); } /** * Creates a document with the provided title and summary. */ public Document(String title, String summary) { this(title, summary, (String) null); } /** * Creates a document with the provided title, summary and * language. */ public Document(String title, String summary, LanguageCode language) { this(title, summary, null, language); } /** * Creates a document with the provided title, summary and * contentUrl. */ public Document(String title, String summary, String contentUrl) { this(title, summary, contentUrl, null); } /** * Creates a document with the provided title, summary, * contentUrl and language. */ public Document(String title, String summary, String contentUrl, LanguageCode language) { setField(TITLE, title); setField(SUMMARY, summary); if (StringUtils.isNotBlank(contentUrl)) { setField(CONTENT_URL, contentUrl); } if (language != null) { setField(LANGUAGE, language); } } /** * Creates a document with the provided title, summary, * contentUrl and language and ID. IDs should be unique * for clustering. If all documents passed for clustering have null IDs then * IDs are automatically generated. */ public Document(String title, String summary, String contentUrl, LanguageCode language, String id) { this(title, summary, contentUrl, language); this.id = id; } /** * @deprecated please use {@link #getStringId()} instead. Currently, this method * attempts to parse the string identifier returned by * {@link #getStringId()} into an integer. * @throws NumberFormatException if the identifier could not be converted to an integer * number */ public Integer getId() { try { return id != null ? Integer.parseInt(id) : null; } catch (NumberFormatException e) { throw new NumberFormatException("Could not parse document identifier as an integer: " + id); } } /** * Identifier of this document. The semantics of the identifier varies depending on * the {@link IDocumentSource} that produced the documents. *

* When processing documents produced by Carrot2-provided {@link IDocumentSource}, the * framework generates unique integer identifiers for all the documents. However, when * XML document sets are loaded using the * {@link ProcessingResult#deserialize(java.io.InputStream)} or * {@link ProcessingResult#deserialize(CharSequence)} methods, the original document * identifiers are preserved, which means they may be non-unique or not present at all. *

* * @return identifier of this document, possibly null */ @JsonProperty("id") public String getStringId() { return id; } /** * Returns this document's {@link #TITLE} field. */ @JsonProperty @Element(required = false) public String getTitle() { return getField(TITLE); } /** * Sets this document's {@link #TITLE} field. * * @param title title to set * @return this document for convenience */ @Element(required = false) public Document setTitle(String title) { return setField(TITLE, title); } /** * Returns this document's {@link #SUMMARY} field. */ @JsonProperty("snippet") @Element(name = "snippet", required = false) public String getSummary() { return getField(SUMMARY); } /** * Sets this document's {@link #SUMMARY} field. * * @param summary summary to set * @return this document for convenience */ @Element(name = "snippet", required = false) public Document setSummary(String summary) { return setField(SUMMARY, summary); } /** * Returns this document's {@link #CONTENT_URL} field. */ @JsonProperty("url") @Element(name = "url", required = false) public String getContentUrl() { return getField(CONTENT_URL); } /** * Sets this document's {@link #CONTENT_URL} field. * * @param contentUrl content URL to set * @return this document for convenience */ @Element(name = "url", required = false) public Document setContentUrl(String contentUrl) { return setField(CONTENT_URL, contentUrl); } /** * Returns this document's {@link #SOURCES} field. */ @JsonProperty @ElementList(entry = "source", required = false) public List getSources() { return getField(SOURCES); } /** * Sets this document's {@link #SOURCES} field. * * @param sources the sources list to set * @return this document for convenience */ @ElementList(entry = "source", required = false) public Document setSources(List sources) { return setField(SOURCES, sources); } /** * Returns this document's {@link #LANGUAGE}. */ public LanguageCode getLanguage() { return getField(LANGUAGE); } /** * Sets this document's {@link #LANGUAGE}. * * @param language the language to set * @return this document for convenience */ public Document setLanguage(LanguageCode language) { return setField(LANGUAGE, language); } /** * Returns this document's {@link #SCORE}. * * @return this document's {@link #SCORE}. */ @Attribute(name = "score", required = false) public Double getScore() { return getField(SCORE); } /** * Sets this document's {@link #SCORE}. * * @param score the {@link #SCORE} to set * @return this document for convenience. */ @Attribute(name = "score", required = false) public Document setScore(Double score) { return setField(SCORE, score); } @JsonProperty("language") @Attribute(required = false, name = "language") private String getLanguageIsoCode() { final LanguageCode language = getLanguage(); return language != null ? language.getIsoCode() : null; } @Attribute(required = false, name = "language") private void setLanguageIsoCode(String languageIsoCode) { if (languageIsoCode != null) { final LanguageCode language = LanguageCode.forISOCode(languageIsoCode); if (language != null) { setLanguage(language); } else { // Try by enum name for backward-compatibility setLanguage(LanguageCode.valueOf(languageIsoCode)); } } else { setLanguage(null); } } /** * For JSON and XML serialization only. */ @JsonProperty("fields") private Map getOtherFields() { final Map otherFields; // If a caching controller is used, concurrent threads can operate on the same // instance of the Document class, so we need to synchronize here to avoid // ConcurrentModificationExceptions. synchronized (this) { otherFields = Maps.newHashMap(fields); } otherFields.remove(TITLE); otherFields.remove(SUMMARY); otherFields.remove(CONTENT_URL); otherFields.remove(SOURCES); otherFields.remove(LANGUAGE); otherFields.remove(SCORE); fireSerializationListeners(otherFields); return otherFields.isEmpty() ? null : otherFields; } /* * */ @ElementMap(entry = "field", key = "key", attribute = true, inline = true, required = false) private HashMap getOtherFieldsXml() { final HashMap otherFieldsForSerialization; synchronized (this) { otherFieldsForSerialization = MapUtils.asHashMap(SimpleXmlWrappers .wrap(fields)); } otherFieldsForSerialization.remove(TITLE); otherFieldsForSerialization.remove(SUMMARY); otherFieldsForSerialization.remove(CONTENT_URL); otherFieldsForSerialization.remove(SOURCES); otherFieldsForSerialization.remove(LANGUAGE); otherFieldsForSerialization.remove(SCORE); fireSerializationListeners(otherFieldsForSerialization); return otherFieldsForSerialization.isEmpty() ? null : otherFieldsForSerialization; } /* * */ @ElementMap(entry = "field", key = "key", attribute = true, inline = true, required = false) private void setOtherFieldsXml( HashMap otherFieldsForSerialization) { if (otherFieldsForSerialization != null) { // No need to synchronize here, the object is being deserialized, // so it can't yet be seen by other threads. fields.putAll(SimpleXmlWrappers.unwrap(otherFieldsForSerialization)); } } /** * Returns all fields of this document. The returned map is unmodifiable. * * @return all fields of this document */ public Map getFields() { return fieldsView; } /** * Returns value of the specified field of this document. If no field corresponds to * the provided name, null will be returned. * * @param name of the field to be returned * @return value of the field or null */ @SuppressWarnings("unchecked") public T getField(String name) { synchronized (this) { return (T) fields.get(name); } } /** * Sets a field in this document. * * @param name of the field to set * @param value value of the field * @return this document for convenience */ public Document setField(String name, Object value) { synchronized (this) { fields.put(name, value); } return this; } /** * Creates a shallow clone of itself. The identifier * and the fields map is copied but values inside fields are not cloned. */ @Override public Document clone() { Document clone = new Document(); clone.id = this.id; clone.fields.putAll(this.fields); return clone; } /** * Assigns sequential identifiers to the provided documents. If any * document in the set has a non-empty identifier, no identifiers will be generated at * all. * * @param documents documents to assign identifiers to. * @throws IllegalArgumentException Thrown if the collection of documents already contains * identifiers and they are not unique. */ public static void assignDocumentIds(Collection documents) { // We may get concurrent calls referring to the same documents // in the same list, so we need to synchronize here. synchronized (documents) { // Make sure there are no identifiers. Or if they are present, they should be unique. boolean hadIds = false; for (Document document : documents) { if (document.id != null) { hadIds = true; break; } } if (hadIds) { final HashSet ids = Sets.newHashSet(); for (Document doc : documents) { String id = doc.getStringId(); if (!ids.add(id) && id != null) { throw new IllegalArgumentException( "Identifiers must be unique, duplicated identifier: " + id + " [existing: " + ids.toString() + "]"); } } if (ids.contains(null)) { throw new IllegalArgumentException( "Null identifiers cannot be mixed with existing non-null identifiers: " + " [existing: " + ids.toString() + "]"); } } else { // All nulls, assign ids. int id = 0; for (final Document document : documents) { document.id = Integer.toString(id); id++; } } } } /** * Transforms a {@link Document} to its identifier returned by * {@link Document#getId()}. * * @deprecated Please use #getStringId() directly or use your own {@link Function} * implementation. */ public static final class DocumentToId implements Function { public static final DocumentToId INSTANCE = new DocumentToId(); private DocumentToId() { } public Integer apply(Document document) { return document.getId(); } } /** * Compares {@link Document}s by their identifiers {@link #getId()}, which effectively * gives the original order in which they were returned by the document source. * * @deprecated semantics of the identifiers depends on the document source, please * roll your own comparator that is aware of the actual id semantics. */ public static final Comparator BY_ID_COMPARATOR = Ordering.natural() .nullsFirst().onResultOf(DocumentToId.INSTANCE); /** * Adds a serialization listener to this document. * * @param listener the listener to add */ public void addSerializationListener(IDocumentSerializationListener listener) { synchronized (this) { if (serializationListeners == null) { serializationListeners = Lists.newArrayList(); } serializationListeners.add(listener); } } /** * Enables listening to events related to XML/JSON serialization of {@link Document}s. */ public static interface IDocumentSerializationListener { /** * Called before a {@link Document} gets serialized to XML or JSON. Specific * implementations may want to modify some properties of the document before it * gets serialized * * @param document the documents being serialized. Note: changes to the document * will not be undone after serialization completes. * @param otherFieldsForSerialization custom fields that are about to be * serialized. Changes made on this map will not affect the contents of * the document. */ public void beforeSerialization(Document document, Map otherFieldsForSerialization); } private void fireSerializationListeners(Map otherFieldsForSerialization) { synchronized (this) { if (serializationListeners != null) { for (IDocumentSerializationListener listener : serializationListeners) { listener.beforeSerialization(this, otherFieldsForSerialization); } } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy