All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.basistech.rosette.dm.AnnotatedText Maven / Gradle / Ivy

There is a newer version: 3.0.3
Show newest version
/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.basistech.rosette.dm;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * The root of the data model. An {@code AnnotatedText} is blob of text and its attributes.
 * The attributes are available from {@link #getAttributes()}, as well as from
 * some convenience accessors, such as {@link #getTokens()} or {@link #getEntities()}.
 * 
* Generally, offsets used in the data model are character (UTF-16 elements) offsets into the * original text. Offset ranges are always half-open. For example: *
 * 012345678901
 * Hello world
 * 
* The token "Hello" has start offset 0 and end offset 5. *
* A note on serialization: due to the internal structure of this class and the classes * that make up the model, we do not recommend that applications serialize this to * Json (or XML or other representations) by applying a reflection-based toolkit 'as-is'. * For Json, and Java, the 'adm-json' module provides the supported serialization. */ @SuppressWarnings("deprecation") public class AnnotatedText implements Serializable { private static final long serialVersionUID = 222L; private final CharSequence data; /* The attributes for this text, indexed by type. * Only one attribute of a type is permitted, thus the concept * of a ListAttribute. */ private final Map attributes; private final Map> documentMetadata; private transient boolean compatMentionsProcessed; private transient ListAttribute compatMentions; private transient boolean compatResolvedEntitiesProcessed; private transient ListAttribute compatResolvedEntities; AnnotatedText(CharSequence data, Map attributes, Map> documentMetadata, /* * This version is only here as a workaround for https://github.com/FasterXML/jackson-databind/issues/1118 * It would be better if it was only mentioned in the mixins. No data arrives here, it just * allows the mixin to be matched with the existence of a version item in the json. */ String version) { this.data = data; // allow incoming json that simply lacks attributes or documentMetadata. this.attributes = absorbAttributes(attributes); if (documentMetadata != null) { this.documentMetadata = ImmutableMap.copyOf(documentMetadata); } else { this.documentMetadata = ImmutableMap.of(); } } /* * This method is called from the constructor. It can encounter 'old' attributes * if there is data coming from old json. */ @SuppressWarnings("unchecked") private Map absorbAttributes(Map attributes) { ImmutableMap.Builder builder = new ImmutableMap.Builder<>(); if (attributes == null) { return ImmutableMap.of(); } ListAttribute sourceEntityList = (ListAttribute) attributes.get(AttributeKey.ENTITY.key()); for (Map.Entry me : attributes.entrySet()) { if (!AttributeKey.RESOLVED_ENTITY.key().equals(me.getKey()) && !AttributeKey.ENTITY_MENTION.key().equals(me.getKey()) // defer entity && !AttributeKey.ENTITY.key().equals(me.getKey())) { builder.put(me); } } // Begin compatibility with '1.0' version of ADM. ListAttribute oldMentions = (ListAttribute)attributes.get(AttributeKey.ENTITY_MENTION.key()); ListAttribute oldResolved = (ListAttribute)attributes.get(AttributeKey.RESOLVED_ENTITY.key()); if (anythingInThere(oldResolved) || anythingInThere(oldMentions)) { ConvertFromPreAdm11.doResolvedConversion(sourceEntityList, oldMentions, oldResolved, builder); } else if (sourceEntityList != null) { builder.put(AttributeKey.ENTITY.key(), sourceEntityList); } if (oldResolved != null && oldResolved.size() == 0) { // In this one special class we need to end up with an empty list. // The code otherwise ends up with null. compatResolvedEntities = new ListAttribute.Builder(ResolvedEntity.class).build(); compatResolvedEntitiesProcessed = true; } return builder.build(); } private static boolean anythingInThere(List list) { return list != null; } /** * Returns the character data for this text. * * @return the character data for this text * @adm.ignore */ public CharSequence getData() { return data; } /** * Returns document-level metadata. Metadata keys are simple strings; * values are lists of strings. * * @return map of metadata associated with the document * @adm.ignore */ public Map> getDocumentMetadata() { return documentMetadata; } /** * Returns all of the annotations on this text. For the defined attributes, * the keys will be values from {@link AttributeKey#key()}. The values * are polymorphic; the subclass of {@link BaseAttribute} depends * on the attribute. Applications should usually prefer to use the * convenience accessors (e.g. {@code getTokens}) instead, to avoid the * need for a cast. * * Note that this map will not return {@link EntityMention} or {@link ResolvedEntity} objects, * which are deprecated; they are only available from the specific accessors. * * @return all of the annotations on this text * * @adm.ignore */ public Map getAttributes() { return attributes; } /** * Returns the list of tokens. * * @return the list of tokens */ @SuppressWarnings("unchecked") public ListAttribute getTokens() { return (ListAttribute) attributes.get(AttributeKey.TOKEN.key()); } /** * Returns the translated tokens. This API allows for multiple * translations. For example, element 0 may contain the {@code TranslatedTokens} * for Simplified Chinese, and element 1 may contain the {@code TranslatedTokens} * for Japanese. Usually only element 0 will be populated. * * @return the list of translated tokens */ @SuppressWarnings("unchecked") public ListAttribute getTranslatedTokens() { return (ListAttribute) attributes.get(AttributeKey.TRANSLATED_TOKENS.key()); } /** * Returns the translations for the text. This API allows multiple * translations. For example, element 0 may contain the {@code TranslatedData} * for Simplified Chinese, and element 1 may contain the {@code TranslatedData} * for Japanese. Usually only element 0 will be populated. * * @return the translations for the text */ @SuppressWarnings("unchecked") public ListAttribute getTranslatedData() { return (ListAttribute) attributes.get(AttributeKey.TRANSLATED_DATA.key()); } /** * Returns the list of language regions. * * @return the list of language regions */ @SuppressWarnings("unchecked") public ListAttribute getLanguageDetectionRegions() { return (ListAttribute) attributes.get(AttributeKey.LANGUAGE_DETECTION_REGIONS.key()); } /** * Returns the language results for the entire text. * * @return the language results for the entire text */ public LanguageDetection getWholeTextLanguageDetection() { return (LanguageDetection)attributes.get(AttributeKey.LANGUAGE_DETECTION.key()); } /** * Returns the list of entity mentions. * * @return the list of entity mentions * @deprecated this constructs a list of the old objects for compatibility, the supported * item is {@link Mention}. * */ @SuppressWarnings("unchecked") @Deprecated public ListAttribute getEntityMentions() { if (!compatMentionsProcessed) { compatMentionsProcessed = true; List entityMentionList = Lists.newArrayList(); ListAttribute entities = getEntities(); if (entities != null) { downconvertEntities(entityMentionList, entities); } else { return null; // null entities = null compat. } ListAttribute.Builder cmListBuilder = new ListAttribute.Builder<>(EntityMention.class); for (EntityMention entityMention : entityMentionList) { cmListBuilder.add(entityMention); } if (entities.getExtendedProperties() != null) { for (Map.Entry me : entities.getExtendedProperties().entrySet()) { String key = me.getKey(); if (key.startsWith("mention.")) { cmListBuilder.extendedProperty(key.substring(8), me.getValue()); } } } compatMentions = cmListBuilder.build(); } return compatMentions; } // Class uses to flatten mentions on their way to EntityMentions. private static class MentionAndEntity { final Mention mention; final Entity entity; MentionAndEntity(Mention mention, Entity entity) { this.mention = mention; this.entity = entity; } } private static void downconvertEntities(List entityMentionList, ListAttribute entities) { /* We need to precalculate the order in which we will deliver them to get the coref chain ids. */ List mentionList = new ArrayList<>(); for (Entity entity : entities) { if (entity.getMentions() != null) { for (Mention mention : entity.getMentions()) { mentionList.add(new MentionAndEntity(mention, entity)); } } } // sort mentions in document order, the order we will return them in. Collections.sort(mentionList, new Comparator() { @Override public int compare(MentionAndEntity o1, MentionAndEntity o2) { if (o1.mention.getStartOffset() == o2.mention.getStartOffset()) { return o1.mention.getEndOffset() - o2.mention.getEndOffset(); } else { return o1.mention.getStartOffset() - o2.mention.getStartOffset(); } } }); // Allow us to find the ordinal position of each mention, so that // we can get from a headMentionIndex to a chain id. Map mentionOrdinals = new HashMap<>(); for (int x = 0; x < mentionList.size(); x++) { mentionOrdinals.put(mentionList.get(x).mention, x); } for (MentionAndEntity mae : mentionList) { Mention mention = mae.mention; Entity entity = mae.entity; // If the conversion process stashed a per-mention type, recover it here. String type = (String) mention.getExtendedProperties().get("old-entity-type"); if (type == null) { // In the new model, it's on the Entity. type = entity.getType(); } EntityMention.Builder emBuilder = new EntityMention.Builder(mention.getStartOffset(), mention.getEndOffset(), type); if (entity.getHeadMentionIndex() != null) { // the coref chain id is the location in the list of the mention in question. emBuilder.coreferenceChainId(mentionOrdinals.get(entity.getMentions().get(entity.getHeadMentionIndex()))); } if (mention.getConfidence() != null) { emBuilder.confidence(mention.getConfidence()); } if (mention.getLinkingConfidence() != null) { emBuilder.linkingConfidence(mention.getLinkingConfidence()); } if (mention.getExtendedProperties() != null && mention.getExtendedProperties().size() > 0) { for (Map.Entry me : mention.getExtendedProperties().entrySet()) { if (!me.getKey().equals("old-entity-type")) { if (me.getKey().equals("oldFlags")) { emBuilder.flags((Integer) me.getValue()); } else if (me.getKey().equals("oldCoreferenceChainId")) { // Do not use this. The coref chain ID has to be aligned with the sorted order. } else { emBuilder.extendedProperty(me.getKey(), me.getValue()); } } } } if (mention.getNormalized() != null) { emBuilder.normalized(mention.getNormalized()); } if (mention.getSource() != null) { emBuilder.source(mention.getSource()); } if (mention.getSubsource() != null) { emBuilder.subsource(mention.getSubsource()); } entityMentionList.add(emBuilder.build()); } } /** * Returns the list of entities. Entities are ordered by the document * order of their head mentions. * * @return the list of entities */ @SuppressWarnings("unchecked") public ListAttribute getEntities() { return (ListAttribute) attributes.get(AttributeKey.ENTITY.key()); } /** * Returns the list of relationship mentions. * * @return the list of relationship mentions */ @SuppressWarnings("unchecked") public ListAttribute getRelationshipMentions() { return (ListAttribute) attributes.get(AttributeKey.RELATIONSHIP_MENTION.key()); } /** * Returns the list of resolved entities. * * @return the list of resolved entities * @deprecated this constructs a list of the old objects for compatibility, the supported item * is {@link Entity}. */ @SuppressWarnings("unchecked") @Deprecated public ListAttribute getResolvedEntities() { if (!compatResolvedEntitiesProcessed) { compatResolvedEntitiesProcessed = true; ListAttribute.Builder reListBuilder = new ListAttribute.Builder<>(ResolvedEntity.class); ListAttribute entities = getEntities(); if (entities == null) { return null; } if (entities.getExtendedProperties() != null) { for (Map.Entry me : entities.getExtendedProperties().entrySet()) { String key = me.getKey(); if (!key.startsWith("mention.")) { reListBuilder.extendedProperty(key, me.getValue()); } } } for (Entity entity : entities) { if (entity.getHeadMentionIndex() == null) { // ignore entities without head mentions. continue; } int headStart = 0; int headEnd = 0; if (entity.getHeadMentionIndex() != null) { Mention head = entity.getMentions().get(entity.getHeadMentionIndex()); headStart = head.getStartOffset(); headEnd = head.getEndOffset(); } ResolvedEntity.Builder reBuilder = new ResolvedEntity.Builder(headStart, headEnd, entity.getEntityId()); if (entity.getConfidence() != null) { reBuilder.confidence(entity.getConfidence()); } if (entity.getSentiment() != null && !entity.getSentiment().isEmpty()) { reBuilder.sentiment(entity.getSentiment().get(0)); } if (entity.getExtendedProperties() != null) { for (Map.Entry me : entity.getExtendedProperties().entrySet()) { if (me.getKey().equals("oldCoreferenceChainId")) { reBuilder.coreferenceChainId((Integer)me.getValue()); } else { reBuilder.extendedProperty(me.getKey(), me.getValue()); } } } reListBuilder.add(reBuilder.build()); } compatResolvedEntities = reListBuilder.build(); if (compatResolvedEntities.size() == 0) { // If no resolved entities survived, don't make it look as if someone specified them. /* But note special case in absorbAttributes when someone used the old API to create an empty list. */ compatResolvedEntities = null; } } return compatResolvedEntities; } /** * Returns the list of script regions. * * @return the list of script regions */ @SuppressWarnings("unchecked") public ListAttribute getScriptRegions() { return (ListAttribute) attributes.get(AttributeKey.SCRIPT_REGION.key()); } /** * Returns the list of sentences. * * @return the list of sentences */ @SuppressWarnings("unchecked") public ListAttribute getSentences() { return (ListAttribute) attributes.get(AttributeKey.SENTENCE.key()); } /** * Returns the list of base noun phrases. * * @return the list of base noun phrases */ @SuppressWarnings("unchecked") public ListAttribute getBaseNounPhrases() { return (ListAttribute) attributes.get(AttributeKey.BASE_NOUN_PHRASE.key()); } /** * Returns the list of categorizer results. * * @return the list of categorizer results */ @SuppressWarnings("unchecked") public ListAttribute getCategorizerResults() { return (ListAttribute) attributes.get(AttributeKey.CATEGORIZER_RESULTS.key()); } /** * Returns the list of sentiment results. * * @return the list of sentiment results */ @SuppressWarnings("unchecked") public ListAttribute getSentimentResults() { return (ListAttribute) attributes.get(AttributeKey.SENTIMENT_RESULTS.key()); } /** * Returns the list of dependencies. * * @return the list of dependencies. */ @SuppressWarnings("unchecked") public ListAttribute getDependencies() { return (ListAttribute) attributes.get(AttributeKey.DEPENDENCY.key()); } /* * Returns the list of topic results. Topics differ from categories in * that there is usually a single best category (e.g. SPORTS) whereas * there may be a number good topics (e.g. sports, basketball, Michael * Jordan). * * @return the list of topic results */ @SuppressWarnings("unchecked") public ListAttribute getTopicResults() { return (ListAttribute) attributes.get(AttributeKey.TOPIC_RESULTS.key()); } /** * Return the embeddings associated with this text. Embeddings, sometimes known * as text vectors, are arrays of floating point numbers calculated from the entire * text or subsets such as tokens or entities. * @return the embeddings. */ public Embeddings getEmbeddings() { return (Embeddings) attributes.get(AttributeKey.EMBEDDING.key()); } @SuppressWarnings("unchecked") public ListAttribute getConcepts() { return (ListAttribute) attributes.get(AttributeKey.CONCEPT.key()); } @SuppressWarnings("unchecked") public ListAttribute getKeyphrases() { return (ListAttribute) attributes.get(AttributeKey.KEYPHRASE.key()); } public TransliterationResults getTransliteration() { return (TransliterationResults) attributes.get(AttributeKey.TRANSLITERATION.key()); } /** * toString is a convenience for accessing the textual data, if any, in this annotated text. * @return the data for this AnnotatedText as a String. * If the data is {@code null}, this returns {@code null} * rather than throwing a {@link NullPointerException}. */ @Override public String toString() { if (data == null) { return null; } else { return data.toString(); } } /** * Builder class for {@link AnnotatedText} objects. */ public static class Builder { private CharSequence data; // Keys are strings to allow for extension. Predefined keys are from // AttributeKey.key(). private final Map attributes = Maps.newHashMap(); private final Map> documentMetadata = Maps.newHashMap(); /** * Constructs a builder. The initial data is the empty string. */ public Builder() { // leave data null. } /** * Constructs a builder from an existing {@link com.basistech.rosette.dm.AnnotatedText}. * * @param startingPoint source object to copy */ public Builder(AnnotatedText startingPoint) { this.data = startingPoint.data; this.attributes.putAll(startingPoint.attributes); this.documentMetadata.putAll(startingPoint.documentMetadata); } /** * Constructs a builder over some character data. * * @param data the data. This replaces and previous setting. * @return this */ public Builder data(CharSequence data) { this.data = data; return this; } /** * Returns the current character data. * * @return the current character data */ public CharSequence data() { return data; } /** * Attaches a list of base noun phrases. * * @param baseNounPhrases the base noun phrases * @return this */ public Builder baseNounPhrases(ListAttribute baseNounPhrases) { attributes.put(AttributeKey.BASE_NOUN_PHRASE.key(), baseNounPhrases); return this; } /** * Attaches a list of entity mentions. * * @param entityMentions the entity mentions * @return this * @deprecated Use {@link #entities(ListAttribute)}. */ @Deprecated public Builder entityMentions(ListAttribute entityMentions) { // a new set of old objects replaces any prior set of new objects. attributes.remove(AttributeKey.ENTITY.key()); attributes.put(AttributeKey.ENTITY_MENTION.key(), entityMentions); return this; } /** * Attaches a list of relationship mentions. * * @param relationshipMentions the relationship mentions. * @return this */ public Builder relationshipMentions(ListAttribute relationshipMentions) { attributes.put(AttributeKey.RELATIONSHIP_MENTION.key(), relationshipMentions); return this; } /** * Attaches a list of entities. * @param entities the entities. * @return this. */ public Builder entities(ListAttribute entities) { // specifying entities replaces the old entity structures. attributes.remove(AttributeKey.ENTITY_MENTION.key()); attributes.remove(AttributeKey.RESOLVED_ENTITY.key()); attributes.put(AttributeKey.ENTITY.key(), entities); return this; } /** * Attaches a list of resolved entities. * * @param resolvedEntities the resolved entities * @return this * @deprecated use {@link #entities(ListAttribute)}. */ @Deprecated @SuppressWarnings("unchecked") public Builder resolvedEntities(ListAttribute resolvedEntities) { if (resolvedEntities != null && !resolvedEntities.isEmpty()) { if (attributes.containsKey(AttributeKey.ENTITY.key())) { // we need to recreate the old mentions to go with 'old' resolved entities. List oldList = Lists.newArrayList(); downconvertEntities(oldList, (ListAttribute) attributes.get(AttributeKey.ENTITY.key())); ListAttribute.Builder oldBuilder = new ListAttribute.Builder<>(EntityMention.class); for (EntityMention em : oldList) { oldBuilder.add(em); } attributes.remove(AttributeKey.ENTITY.key()); attributes.put(AttributeKey.ENTITY_MENTION.key(), oldBuilder.build()); } attributes.put(AttributeKey.RESOLVED_ENTITY.key(), resolvedEntities); } return this; } /** * Attaches a list of language detections. * * @param languageDetectionRegions the language detections * @return this */ public Builder languageDetectionRegions(ListAttribute languageDetectionRegions) { attributes.put(AttributeKey.LANGUAGE_DETECTION_REGIONS.key(), languageDetectionRegions); return this; } /** * Attaches a whole-document language detection. * * @param languageDetection the language detection * @return this */ public Builder wholeDocumentLanguageDetection(LanguageDetection languageDetection) { attributes.put(AttributeKey.LANGUAGE_DETECTION.key(), languageDetection); return this; } /** * Attaches a list of script regions. * * @param scriptRegions the script regions * @return this */ public Builder scriptRegions(ListAttribute scriptRegions) { attributes.put(AttributeKey.SCRIPT_REGION.key(), scriptRegions); return this; } /** * Attaches a list of sentences. * * @param sentences the sentences * @return this */ public Builder sentences(ListAttribute sentences) { attributes.put(AttributeKey.SENTENCE.key(), sentences); return this; } /** * Attaches a list of tokens. * * @param tokens the tokens * @return this */ public Builder tokens(ListAttribute tokens) { attributes.put(AttributeKey.TOKEN.key(), tokens); return this; } /** * Attaches a list of TranslatedTokens objects. * * @param translatedTokens a list of TranslatedTokens objects * @return this */ public Builder translatedTokens(ListAttribute translatedTokens) { attributes.put(AttributeKey.TRANSLATED_TOKENS.key(), translatedTokens); return this; } /** * Attaches a TranslatedData object. * * @param translatedData a TranslatedData object * @return this */ public Builder translatedData(ListAttribute translatedData) { attributes.put(AttributeKey.TRANSLATED_DATA.key(), translatedData); return this; } /** * Attaches a list of categorizer results. * * @param categorizerResults the categorizer results * @return this */ public Builder categorizerResults(ListAttribute categorizerResults) { attributes.put(AttributeKey.CATEGORIZER_RESULTS.key(), categorizerResults); return this; } /** * Attaches a list of sentiment results. * * @param sentimentResults the sentiment results * @return this */ public Builder sentimentResults(ListAttribute sentimentResults) { attributes.put(AttributeKey.SENTIMENT_RESULTS.key(), sentimentResults); return this; } /** * Attaches a list of dependencies. * * @param dependencies the dependencies. * @return this */ public Builder dependencies(ListAttribute dependencies) { attributes.put(AttributeKey.DEPENDENCY.key(), dependencies); return this; } /* * Attaches a list of topic results. * * @param topicResults the topic results * @return this */ public Builder topicResults(ListAttribute topicResults) { attributes.put(AttributeKey.TOPIC_RESULTS.key(), topicResults); return this; } /** * Attaches a set of embeddings. * @param embeddings the embeddings. * @return this. */ public Builder embeddings(Embeddings embeddings) { attributes.put(AttributeKey.EMBEDDING.key(), embeddings); return this; } public Builder concepts(ListAttribute concepts) { attributes.put(AttributeKey.CONCEPT.key(), concepts); return this; } public Builder keyphrases(ListAttribute keyphrases) { attributes.put(AttributeKey.KEYPHRASE.key(), keyphrases); return this; } public Builder transliteration(TransliterationResults transliterationResults) { attributes.put(AttributeKey.TRANSLITERATION.key(), transliterationResults); return this; } /** * Adds an attribute. * * @param key the attribute key. See {@link AttributeKey}. * @param attribute the attribute. Replaces any previous value for this key. * @return this */ Builder attribute(String key, BaseAttribute attribute) { attributes.put(key, attribute); return this; } /** * Adds an attribute. * * @param key the attribute key. * @param attribute the attribute. Replaces any previous value for this key. * @return this */ Builder attribute(AttributeKey key, BaseAttribute attribute) { attributes.put(key.key(), attribute); return this; } /** * Returns the current attributes. * * @return the current attributes */ public Map attributes() { return attributes; } /** * Adds an entry to the document metadata. Replaces any previous value for this key. * * @param key key * @param value value * @return this */ public Builder documentMetadata(String key, List value) { documentMetadata.put(key, ImmutableList.copyOf(value)); return this; } /** * Add all of the contents of a map of metadata to the document metadata. * * @param mapOfValues a map from keys to values. * @return this */ public Builder documentMetadata(Map> mapOfValues) { for (Map.Entry> me : mapOfValues.entrySet()) { documentMetadata.put(me.getKey(), ImmutableList.copyOf(me.getValue())); } return this; } /** * Adds an entry to the document metadata. Replaces any previous value for this key. * * @param key key * @param value A single string value. The result of this call is to store a list containing this value * as the value for this key. * @return this */ public Builder documentMetadata(String key, String value) { documentMetadata.put(key, Lists.newArrayList(value)); return this; } /** * Returns the current document metadata. * * @return the current document metadata */ public Map> documentMetadata() { return documentMetadata; } /** * Constructs a {@link AnnotatedText} object from the settings in this builder. * * @return the new object */ public AnnotatedText build() { return new AnnotatedText(data, attributes, documentMetadata, null); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy