// org.carrot2.core.Document (Maven / Gradle / Ivy)
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.core;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.carrot2.util.MapUtils;
import org.carrot2.util.simplexml.SimpleXmlWrapperValue;
import org.carrot2.util.simplexml.SimpleXmlWrappers;
import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.Element;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.ElementMap;
import org.simpleframework.xml.Root;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.carrot2.shaded.guava.common.base.Function;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;
import org.carrot2.shaded.guava.common.collect.Ordering;
import org.carrot2.shaded.guava.common.collect.Sets;
/**
* A document to be processed by the framework. Each document is a collection of
* fields carrying different bits of information, e.g. {@link #TITLE} or
* {@link #CONTENT_URL}.
*/
@Root(name = "document")
@JsonAutoDetect(
creatorVisibility = JsonAutoDetect.Visibility.NONE,
fieldVisibility = JsonAutoDetect.Visibility.NONE,
getterVisibility = JsonAutoDetect.Visibility.NONE,
isGetterVisibility = JsonAutoDetect.Visibility.NONE,
setterVisibility = JsonAutoDetect.Visibility.NONE)
@JsonSerialize()
@JsonInclude(JsonInclude.Include.NON_NULL)
public final class Document implements Cloneable
{
/** Field name for the title of the document. */
public static final String TITLE = "title";
/**
 * Field name for a short summary of the document, e.g. the snippet returned by the
 * search engine.
 */
public static final String SUMMARY = "snippet";
/** Field name for a URL pointing to the full version of the document. */
public static final String CONTENT_URL = "url";
/**
 * Field name for the click URL, i.e. the URL that should be placed in the anchor to
 * the document instead of the value returned in {@link #CONTENT_URL}.
 */
public static final String CLICK_URL = "click-url";
/**
 * Field name for a URL pointing to the thumbnail image associated with the document.
 */
public static final String THUMBNAIL_URL = "thumbnail-url";
/** Field name for the document size. */
public static final String SIZE = "size";
/**
 * Field name for the document score. The semantics of the score depends on the
 * specific document source. Some document sources may not provide document scores
 * at all.
 */
public static final String SCORE = "score";
/**
 * Field name for a list of sources the document was found in. Value type:
 * {@code List<String>}.
 */
public static final String SOURCES = "sources";
/**
 * Field name for the language in which the document is written. Value type:
 * {@link LanguageCode}. If the language field is not defined or is {@code null},
 * it means the language of the document is unknown or it is outside of the list
 * defined in {@link LanguageCode}.
 */
public static final String LANGUAGE = "language";
/**
 * Field name for identifiers of reference clustering partitions this document belongs
 * to. Currently, this field is used only to calculate various clustering quality
 * metrics. In the future, clustering algorithms may be able to use values of this
 * field to increase the quality of clustering.
 * <p>
 * Value type: {@code Collection<Object>}. There is no constraint on the actual type
 * of the partition identifier in the collection. Identifiers are assumed to correctly
 * implement the {@link Object#equals(Object)} and {@link Object#hashCode()} methods.
 */
public static final String PARTITIONS = "partitions";
/**
 * Fields of this document, keyed by field name. The accessor methods of this class
 * read and write this map while holding this document's monitor.
 */
private final Map<String, Object> fields = Maps.newHashMap();
/**
 * Read-only view of {@link #fields}, exposed by {@link #getFields()}. This is a live
 * view, not a snapshot: later changes to the fields are visible through it.
 */
private final Map<String, Object> fieldsView = Collections.unmodifiableMap(fields);
/**
 * Identifier of this document, possibly {@code null}.
 *
 * @see #getStringId()
 * @see ProcessingResult
 */
@Attribute(required = false)
String id;
/**
 * Listeners to be notified before this document gets serialized. Remains
 * {@code null} until the first listener is added.
 */
private ArrayList<IDocumentSerializationListener> serializationListeners;
/**
 * Creates an empty document: no fields are set and the identifier is left unset.
 */
public Document() {
    // Nothing to initialize here; the field map starts out empty.
}
/**
 * Creates a document with the provided {@code title}.
 *
 * @param title value of the {@link #TITLE} field
 */
public Document(String title) {
    this(title, null);
}
/**
 * Creates a document with the provided {@code title} and {@code summary}.
 *
 * @param title value of the {@link #TITLE} field
 * @param summary value of the {@link #SUMMARY} field
 */
public Document(String title, String summary) {
    // The cast selects the (String, String, String) overload over the
    // (String, String, LanguageCode) one.
    this(title, summary, (String) null);
}
/**
 * Creates a document with the provided {@code title}, {@code summary} and
 * {@code language}.
 *
 * @param title value of the {@link #TITLE} field
 * @param summary value of the {@link #SUMMARY} field
 * @param language value of the {@link #LANGUAGE} field; not stored when {@code null}
 */
public Document(String title, String summary, LanguageCode language) {
    this(title, summary, null, language);
}
/**
 * Creates a document with the provided {@code title}, {@code summary} and
 * {@code contentUrl}.
 *
 * @param title value of the {@link #TITLE} field
 * @param summary value of the {@link #SUMMARY} field
 * @param contentUrl value of the {@link #CONTENT_URL} field; not stored when blank
 */
public Document(String title, String summary, String contentUrl) {
    this(title, summary, contentUrl, null);
}
/**
* Creates a document with the provided title
, summary
,
* contentUrl
and language
.
*/
public Document(String title, String summary, String contentUrl, LanguageCode language)
{
setField(TITLE, title);
setField(SUMMARY, summary);
if (StringUtils.isNotBlank(contentUrl))
{
setField(CONTENT_URL, contentUrl);
}
if (language != null)
{
setField(LANGUAGE, language);
}
}
/**
 * Creates a document with the provided {@code title}, {@code summary},
 * {@code contentUrl}, {@code language} and {@code id}. IDs should be unique for
 * clustering. If all documents passed for clustering have {@code null} IDs then IDs
 * are automatically generated.
 *
 * @param title value of the {@link #TITLE} field
 * @param summary value of the {@link #SUMMARY} field
 * @param contentUrl value of the {@link #CONTENT_URL} field; not stored when blank
 * @param language value of the {@link #LANGUAGE} field; not stored when {@code null}
 * @param id identifier of the document, possibly {@code null}
 */
public Document(String title, String summary, String contentUrl, LanguageCode language, String id) {
    this(title, summary, contentUrl, language);

    // The identifier is kept outside of the field map.
    this.id = id;
}
/**
 * Returns the identifier of this document parsed as an integer.
 *
 * @return the identifier as an integer, or {@code null} if this document has no
 *         identifier
 * @throws NumberFormatException if the identifier could not be converted to an integer
 *         number; the original parsing failure is attached as the cause
 * @deprecated please use {@link #getStringId()} instead. Currently, this method
 *             attempts to parse the string identifier returned by
 *             {@link #getStringId()} into an integer.
 */
@Deprecated
public Integer getId() {
    if (id == null) {
        return null;
    }

    try {
        return Integer.parseInt(id);
    } catch (NumberFormatException e) {
        // NumberFormatException has no (message, cause) constructor, so the cause
        // is attached separately to keep the original stack trace available.
        final NumberFormatException failure = new NumberFormatException(
            "Could not parse document identifier as an integer: " + id);
        failure.initCause(e);
        throw failure;
    }
}
/**
 * Returns the identifier of this document. The semantics of the identifier varies
 * depending on the {@link IDocumentSource} that produced the documents.
 * <p>
 * When processing documents produced by Carrot2-provided {@link IDocumentSource}, the
 * framework generates unique integer identifiers for all the documents. However, when
 * XML document sets are loaded using the
 * {@link ProcessingResult#deserialize(java.io.InputStream)} or
 * {@link ProcessingResult#deserialize(CharSequence)} methods, the original document
 * identifiers are preserved, which means they may be non-unique or not present at all.
 *
 * @return identifier of this document, possibly {@code null}
 */
@JsonProperty("id")
public String getStringId() {
    return this.id;
}
/**
 * Returns this document's {@link #TITLE} field.
 *
 * @return the title, or {@code null} if none is set
 */
@JsonProperty
@Element(required = false)
public String getTitle() {
    final String title = getField(TITLE);
    return title;
}
/**
 * Sets this document's {@link #TITLE} field.
 *
 * @param title title to set
 * @return this document for convenience
 */
@Element(required = false)
public Document setTitle(String title) {
    setField(TITLE, title);
    return this;
}
/**
 * Returns this document's {@link #SUMMARY} field.
 *
 * @return the summary, or {@code null} if none is set
 */
@JsonProperty("snippet")
@Element(name = "snippet", required = false)
public String getSummary() {
    final String summary = getField(SUMMARY);
    return summary;
}
/**
 * Sets this document's {@link #SUMMARY} field.
 *
 * @param summary summary to set
 * @return this document for convenience
 */
@Element(name = "snippet", required = false)
public Document setSummary(String summary) {
    setField(SUMMARY, summary);
    return this;
}
/**
 * Returns this document's {@link #CONTENT_URL} field.
 *
 * @return the content URL, or {@code null} if none is set
 */
@JsonProperty("url")
@Element(name = "url", required = false)
public String getContentUrl() {
    final String contentUrl = getField(CONTENT_URL);
    return contentUrl;
}
/**
 * Sets this document's {@link #CONTENT_URL} field.
 *
 * @param contentUrl content URL to set
 * @return this document for convenience
 */
@Element(name = "url", required = false)
public Document setContentUrl(String contentUrl) {
    setField(CONTENT_URL, contentUrl);
    return this;
}
/**
 * Returns this document's {@link #SOURCES} field.
 *
 * @return the list of sources, or {@code null} if none is set
 */
@JsonProperty
@ElementList(entry = "source", required = false)
public List<String> getSources() {
    return getField(SOURCES);
}
/**
 * Sets this document's {@link #SOURCES} field. The list is stored as is, it is not
 * copied.
 *
 * @param sources the sources list to set
 * @return this document for convenience
 */
@ElementList(entry = "source", required = false)
public Document setSources(List<String> sources) {
    return setField(SOURCES, sources);
}
/**
 * Returns this document's {@link #LANGUAGE}.
 *
 * @return the language, or {@code null} if none is set
 */
public LanguageCode getLanguage() {
    final LanguageCode language = getField(LANGUAGE);
    return language;
}
/**
 * Sets this document's {@link #LANGUAGE}.
 *
 * @param language the language to set
 * @return this document for convenience
 */
public Document setLanguage(LanguageCode language) {
    setField(LANGUAGE, language);
    return this;
}
/**
 * Returns this document's {@link #SCORE}.
 *
 * @return this document's {@link #SCORE}, or {@code null} if none is set
 */
@Attribute(name = "score", required = false)
public Double getScore() {
    final Double score = getField(SCORE);
    return score;
}
/**
 * Sets this document's {@link #SCORE}.
 *
 * @param score the {@link #SCORE} to set
 * @return this document for convenience
 */
@Attribute(name = "score", required = false)
public Document setScore(Double score) {
    setField(SCORE, score);
    return this;
}
/**
 * Returns the ISO code of this document's {@link #LANGUAGE}. For JSON and XML
 * serialization only.
 *
 * @return the ISO code of the language, or {@code null} if no language is set
 */
@JsonProperty("language")
@Attribute(required = false, name = "language")
private String getLanguageIsoCode() {
    final LanguageCode code = getLanguage();
    if (code == null) {
        return null;
    }
    return code.getIsoCode();
}
/**
 * Sets this document's {@link #LANGUAGE} from its textual form. For XML
 * deserialization only.
 *
 * @param languageIsoCode ISO code of the language or, for backward compatibility,
 *        the name of a {@link LanguageCode} constant; {@code null} sets the
 *        language to {@code null}
 */
@Attribute(required = false, name = "language")
private void setLanguageIsoCode(String languageIsoCode) {
    if (languageIsoCode == null) {
        setLanguage(null);
        return;
    }

    final LanguageCode byIsoCode = LanguageCode.forISOCode(languageIsoCode);

    // When the ISO code lookup yields nothing, try by enum name for
    // backward-compatibility.
    setLanguage(byIsoCode != null ? byIsoCode : LanguageCode.valueOf(languageIsoCode));
}
/**
 * Returns a copy of this document's fields with the ones that have dedicated
 * properties ({@link #TITLE}, {@link #SUMMARY}, {@link #CONTENT_URL},
 * {@link #SOURCES}, {@link #LANGUAGE}, {@link #SCORE}) removed. Serialization
 * listeners are notified with the copy before it is returned. For JSON
 * serialization only.
 *
 * @return the remaining fields, or {@code null} if there are none
 */
@JsonProperty("fields")
private Map<String, Object> getOtherFields() {
    final Map<String, Object> otherFields;

    // If a caching controller is used, concurrent threads can operate on the same
    // instance of the Document class, so we need to synchronize here to avoid
    // ConcurrentModificationExceptions.
    synchronized (this) {
        otherFields = Maps.newHashMap(fields);
    }

    // These fields are serialized through their own accessors.
    otherFields.remove(TITLE);
    otherFields.remove(SUMMARY);
    otherFields.remove(CONTENT_URL);
    otherFields.remove(SOURCES);
    otherFields.remove(LANGUAGE);
    otherFields.remove(SCORE);

    fireSerializationListeners(otherFields);

    return otherFields.isEmpty() ? null : otherFields;
}
/**
 * Returns a wrapped copy of this document's fields with the ones that have dedicated
 * properties ({@link #TITLE}, {@link #SUMMARY}, {@link #CONTENT_URL},
 * {@link #SOURCES}, {@link #LANGUAGE}, {@link #SCORE}) removed. Values are wrapped
 * using {@link SimpleXmlWrappers#wrap}. Serialization listeners are notified with
 * the copy before it is returned. For XML serialization only.
 *
 * @return the remaining wrapped fields, or {@code null} if there are none
 */
@ElementMap(entry = "field", key = "key", attribute = true, inline = true, required = false)
private HashMap<String, SimpleXmlWrapperValue> getOtherFieldsXml() {
    final HashMap<String, SimpleXmlWrapperValue> otherFieldsForSerialization;

    // The field map may be modified by other threads, so it is read under this
    // document's monitor, like in the other accessors.
    synchronized (this) {
        otherFieldsForSerialization = MapUtils.asHashMap(SimpleXmlWrappers
            .wrap(fields));
    }

    // These fields are serialized through their own accessors.
    otherFieldsForSerialization.remove(TITLE);
    otherFieldsForSerialization.remove(SUMMARY);
    otherFieldsForSerialization.remove(CONTENT_URL);
    otherFieldsForSerialization.remove(SOURCES);
    otherFieldsForSerialization.remove(LANGUAGE);
    otherFieldsForSerialization.remove(SCORE);

    fireSerializationListeners(otherFieldsForSerialization);

    return otherFieldsForSerialization.isEmpty() ? null : otherFieldsForSerialization;
}
/**
 * Adds the provided wrapped fields to this document after unwrapping them with
 * {@link SimpleXmlWrappers#unwrap}. For XML deserialization only.
 *
 * @param otherFieldsForSerialization wrapped fields to add; ignored when
 *        {@code null}
 */
@ElementMap(entry = "field", key = "key", attribute = true, inline = true, required = false)
private void setOtherFieldsXml(
    HashMap<String, SimpleXmlWrapperValue> otherFieldsForSerialization) {
    if (otherFieldsForSerialization != null) {
        // No need to synchronize here, the object is being deserialized,
        // so it can't yet be seen by other threads.
        fields.putAll(SimpleXmlWrappers.unwrap(otherFieldsForSerialization));
    }
}
/**
 * Returns all fields of this document. The returned map is unmodifiable. It is a
 * live view of the document's fields rather than a snapshot, so fields set later
 * are visible through it.
 *
 * @return all fields of this document
 */
public Map<String, Object> getFields() {
    return fieldsView;
}
/**
 * Returns value of the specified field of this document. If no field corresponds to
 * the provided {@code name}, {@code null} will be returned.
 * <p>
 * The value is cast to the type expected by the caller without any check, so
 * requesting a type other than the one stored will result in a
 * {@link ClassCastException} at the call site.
 *
 * @param name of the field to be returned
 * @param <T> type the caller expects the field value to have
 * @return value of the field or {@code null}
 */
@SuppressWarnings("unchecked")
public <T> T getField(String name) {
    synchronized (this) {
        return (T) fields.get(name);
    }
}
/**
 * Sets a field in this document, replacing any value previously stored under the
 * same name.
 *
 * @param name of the field to set
 * @param value value of the field
 * @return this document for convenience
 */
public Document setField(String name, Object value) {
    synchronized (this) {
        fields.put(name, value);
        return this;
    }
}
/**
 * Creates a shallow clone of itself. The identifier and the fields map are copied
 * but values inside fields are not cloned. Serialization listeners are not copied
 * to the clone.
 *
 * @return a new document with the same identifier and field values
 */
@Override
public Document clone() {
    final Document copy = new Document();

    // Other threads may modify the fields of this document concurrently, so the copy
    // is made under this document's monitor, like in the other accessors. The copy
    // itself is not yet visible to any other thread.
    synchronized (this) {
        copy.id = this.id;
        copy.fields.putAll(this.fields);
    }

    return copy;
}
/**
 * Assigns sequential identifiers, starting from 0 and following the iteration order
 * of the collection, to the provided {@code documents}. If any document in the set
 * has a non-{@code null} identifier, no identifiers will be generated at all and the
 * existing identifiers are only validated.
 *
 * @param documents documents to assign identifiers to.
 * @throws IllegalArgumentException Thrown if the collection of documents already
 *         contains identifiers and they are not unique, or if some documents have
 *         identifiers while others do not.
 */
public static void assignDocumentIds(Collection<Document> documents) {
    // We may get concurrent calls referring to the same documents
    // in the same list, so we need to synchronize here.
    synchronized (documents) {
        // Make sure there are no identifiers. Or if they are present, they should be unique.
        boolean hadIds = false;
        for (Document document : documents) {
            if (document.id != null) {
                hadIds = true;
                break;
            }
        }

        if (hadIds) {
            final HashSet<String> ids = Sets.newHashSet();
            for (Document doc : documents) {
                final String id = doc.getStringId();
                // Repeated nulls are not reported as duplicates here, they are
                // rejected as a whole below.
                if (!ids.add(id) && id != null) {
                    throw new IllegalArgumentException(
                        "Identifiers must be unique, duplicated identifier: " + id +
                        " [existing: " + ids.toString() + "]");
                }
            }

            if (ids.contains(null)) {
                throw new IllegalArgumentException(
                    "Null identifiers cannot be mixed with existing non-null identifiers: " +
                    " [existing: " + ids.toString() + "]");
            }
        } else {
            // All nulls, assign ids.
            int id = 0;
            for (final Document document : documents) {
                document.id = Integer.toString(id);
                id++;
            }
        }
    }
}
/**
* Transforms a {@link Document} to its identifier returned by
* {@link Document#getId()}.
*
* @deprecated Please use #getStringId() directly or use your own {@link Function}
* implementation.
*/
public static final class DocumentToId implements Function
{
public static final DocumentToId INSTANCE = new DocumentToId();
private DocumentToId()
{
}
public Integer apply(Document document)
{
return document.getId();
}
}
/**
 * Compares {@link Document}s by their identifiers {@link #getId()}, which effectively
 * gives the original order in which they were returned by the document source.
 * Documents without an identifier are ordered first. As the comparison relies on
 * {@link #getId()}, it throws {@link NumberFormatException} for documents whose
 * identifiers are not integers.
 *
 * @deprecated semantics of the identifiers depends on the document source, please
 *             roll your own comparator that is aware of the actual id semantics.
 */
@Deprecated
public static final Comparator<Document> BY_ID_COMPARATOR = Ordering.<Integer> natural()
    .nullsFirst().onResultOf(DocumentToId.INSTANCE);
/**
 * Registers a listener to be notified before this document gets serialized.
 *
 * @param listener the listener to add
 */
public void addSerializationListener(IDocumentSerializationListener listener) {
    synchronized (this) {
        // The listener list is created on first use only.
        if (null == serializationListeners) {
            serializationListeners = Lists.newArrayList();
        }

        serializationListeners.add(listener);
    }
}
/**
* Enables listening to events related to XML/JSON serialization of {@link Document}s.
*/
public static interface IDocumentSerializationListener
{
/**
* Called before a {@link Document} gets serialized to XML or JSON. Specific
* implementations may want to modify some properties of the document before it
* gets serialized
*
* @param document the documents being serialized. Note: changes to the document
* will not be undone after serialization completes.
* @param otherFieldsForSerialization custom fields that are about to be
* serialized. Changes made on this map will not affect the contents of
* the document.
*/
public void beforeSerialization(Document document,
Map otherFieldsForSerialization);
}
/**
 * Notifies all registered serialization listeners, in the order they were added,
 * that this document is about to be serialized. Does nothing if no listener has
 * been registered.
 * <p>
 * Listeners are invoked while this document's monitor is held.
 *
 * @param otherFieldsForSerialization copy of the custom fields that are about to be
 *        serialized, passed on to the listeners
 */
private void fireSerializationListeners(Map<String, ?> otherFieldsForSerialization) {
    synchronized (this) {
        if (serializationListeners != null) {
            for (IDocumentSerializationListener listener : serializationListeners) {
                listener.beforeSerialization(this, otherFieldsForSerialization);
            }
        }
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy