org.opensextant.annotations.AnnotationHelper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensextant-xponents-core Show documentation
Show all versions of opensextant-xponents-core Show documentation
An information extraction toolkit focused on geography and temporal entities
/*
* IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
*
* OpenSextant/Xponents sub-project
* __
* ___/ /___ ___ ___ ___ __ __ ___
* / _ // -_)/ -_)/ _ \/ -_)/ // // -_)
* \_,_/ \__/ \__// .__/\__/ \_, / \__/
* /_/ /___/
*
* IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
* Copyright 2013, 2019 MITRE Corporation
*/
package org.opensextant.annotations;
import static org.opensextant.util.GeodeticUtility.isValidNonZeroCoordinate;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.opensextant.data.Country;
import org.opensextant.data.Geocoding;
import org.opensextant.data.Place;
import org.opensextant.data.Taxon;
import org.opensextant.util.TextUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import jodd.json.JsonObject;
/**
*
* Basis for this optional helper class was three or four different projects using DeepEye as
* a model for persisting annotations from the typical Named Entity and Geo/Time extraction work.
*
* Common annotation practices include:
* - All annotations should have both their own ID as well as a Record ID; this involves tracking unique annots for a given doc.
* The intent of DeepEye identifiers is that they are unique within a system or a data set, not necessarily UUID:
* - Record IDs should be related to their source identifier
* - Annotation IDs should be deterministic based on metadata tuple: MD5( name + value + contrib + rec_id ) or a similar predictable, reproducible hash
* - Caching and consolidating repetitive annotations. Example: seeing 'U.S.' 100 times in a
* single document as a "GPE" or as a "geo" can be overwhelming. Is it 100 individual annots,
* or 1 annotation for GPE and 1 annotation for the geo. Each annotation tracks as many offsets as it needs.
* As individual instances in the document vary in attributes or other metadata, then those
* should be considered unique annotations.
* - Allow common doc-level annotations, such as a topic tag, even when there is no offset or text span in question. "Span-less" annotations.
*
* Advice for what to persist to a DeepEye database:
* - it's costly or complex to compute
* - it's helpful to aggregate many raw extracted values to review over a large data set
*
* Consider not storing:
* - annotations that are trivial to compute at runtime, e.g., indexing a derived metadata tag,
* such as converting 'US' to 'United States'
* - NLP artifacts such as tokens like pronouns or other parts of speech. These might be
* - Filtered values -- if you are filtering out certain data in all your analyses or downstream
* operations, consider filtering out such things before you store them blindly.
*
*
* SEE ALSO: Xponents SDK class org.opensextant.output.Transform: this utility class offers more ideas on standard JSON representations for REST.
* whereas this utility is aimed at a more reliable pure representation of the match data for storing/retrieving from a data store.
* @author ubaldino
*
*/
public class AnnotationHelper {
/** Class-wide logger; used for data-quality messages emitted while decoding stored annotations. */
private static final Logger logger = LoggerFactory.getLogger(AnnotationHelper.class);
/**
 * Creates a helper instance. Use an instance if you are using any sort of caching for
 * convenience: the annotation cache is per-instance state, while the encode/decode and
 * create utilities of this class are static.
 */
public AnnotationHelper() {
}
/**
 * Given encoded annotations from a data store, decode them and yield a flattened list of
 * annotations, e.g., for use with MAT.
 *
 * <ul>
 * <li>An annotation whose own {@code offset} is non-negative is passed through as-is.</li>
 * <li>Otherwise, if its attributes contain an {@code "offsets"} entry, that entry is decoded
 * and one copy of the annotation is emitted per offset.</li>
 * <li>Anything else (no attributes, or no {@code "offsets"} entry) is an ordinary
 * document-level annotation, e.g., a classifier label, and is passed through unchanged.</li>
 * </ul>
 *
 * @param codedAnnots the coded annotations; must not be null
 * @return the flattened annotations, in the order of the input
 */
public static List<Annotation> decodeAnnotations(List<Annotation> codedAnnots) {
    List<Annotation> annots = new ArrayList<>();
    for (Annotation a : codedAnnots) {
        // Only a span-less annotation that carries encoded offsets needs expanding.
        boolean hasEncodedOffsets = a.offset < 0 && a.attrs != null && a.attrs.containsKey("offsets");
        if (hasEncodedOffsets) {
            annots.addAll(decodeOffsets(a, a.attrs.getString("offsets")));
        } else {
            annots.add(a);
        }
    }
    return annots;
}
/**
 * Gets the first offset of an annotation.
 *
 * If the annotation has its own non-negative {@code offset}, that is returned. Otherwise the
 * {@code "offsets"} attribute is decoded and the first value in encoded order is returned
 * (this is not necessarily the smallest offset).
 *
 * @param a the annotation
 * @return the first offset, or -1 if the annotation has no offset information
 */
public static long getFirstOffset(Annotation a) {
    if (a.offset >= 0) {
        return a.offset;
    }
    // Fixed: the original test was inverted and returned -1 exactly when "offsets" was present.
    if (a.attrs == null || !a.attrs.containsKey("offsets")) {
        return -1;
    }
    String val = a.attrs.getString("offsets");
    if (StringUtils.isBlank(val)) {
        return -1;
    }
    List<Integer> offsets = decodeOffsets(val);
    if (offsets.isEmpty()) {
        return -1;
    }
    return offsets.get(0).intValue();
}
/** Separator placed between individual offsets in an encoded offset string, e.g., "1;5;89". */
public static final String NUM_SEP = ";";
/**
 * Encode offsets as a single string, joining the values with {@link #NUM_SEP}.
 *
 * [ 1, 5, 89, 777 ] =&gt; "1;5;89;777" (the order follows the iteration order of the collection).
 *
 * @param offsets the offsets to encode
 * @return the encoded string
 */
public static String encodeOffsets(Collection<Integer> offsets) {
    return StringUtils.join(offsets, NUM_SEP);
}
/**
 * Take an encoded list of numbers and convert it to an Integer list.
 *
 * "1;5;89;777" =&gt; List&lt;Integer&gt; [ 1, 5, 89, 777 ].
 *
 * A null or blank input yields an empty list. Whitespace around a number is ignored and
 * empty entries (e.g., from "1;;5") are skipped.
 *
 * @param list the encoded offsets, separated by {@link #NUM_SEP}
 * @return the decoded offsets, in encoded order; never null
 * @throws NumberFormatException if a non-empty entry is not a valid integer
 */
public static List<Integer> decodeOffsets(String list) {
    List<Integer> numarray = new ArrayList<>();
    if (list == null) {
        return numarray;
    }
    for (String n : list.split(NUM_SEP)) {
        String token = n.trim();
        if (token.isEmpty()) {
            // "".split(";") yields one empty token; tolerate it rather than fail in parseInt.
            continue;
        }
        numarray.add(Integer.parseInt(token));
    }
    return numarray;
}
/**
 * Generate annotations in a linear fashion. Given the optimized annotation {@code meta},
 * create duplicate annotations, each with one offset from the list of offsets.
 *
 * Each duplicate copies id, contrib, value, name, rec_id and source_id from {@code meta} and
 * gets its own shallow copy of the attributes with the {@code "offsets"} entry removed, so
 * changing the attributes of one duplicate does not affect its siblings or {@code meta}.
 *
 * @param meta       the annotation to duplicate
 * @param offsetList the encoded offsets, see {@link #decodeOffsets(String)}
 * @return one annotation per decoded offset, in encoded order
 */
public static List<Annotation> decodeOffsets(Annotation meta, String offsetList) {
    List<Annotation> annots = new ArrayList<>();
    List<Integer> offsets = decodeOffsets(offsetList);
    for (Integer x : offsets) {
        // COPY annotation "meta" to "a"
        Annotation a = new Annotation(meta.id);
        a.contrib = meta.contrib;
        a.value = meta.value;
        a.name = meta.name;
        a.rec_id = meta.rec_id;
        a.source_id = meta.source_id;
        // Set offset here.
        a.offset = x.intValue();
        a.attrs = copyAttributesWithoutOffsets(meta.attrs);
        annots.add(a);
    }
    return annots;
}

/** Shallow-copies attributes (null is treated as empty), dropping the "offsets" entry. */
private static JsonObject copyAttributesWithoutOffsets(JsonObject attrs) {
    JsonObject copy = new JsonObject();
    if (attrs != null) {
        copy.mergeIn(attrs);
        copy.remove("offsets");
    }
    return copy;
}
/**
 * New, required format for an annotation ID: a text hash made up of:
 *
 * rec_id + contributor + type + value
 *
 * Distinct entities from a single contributor for a single document will be recorded (and
 * overwritten over time). Reprocessing data will overwrite with a new value.
 *
 * Example: doc abc has a NAMEX = 'the diplomat', provided by xyz extractor. The key is the
 * hash of 'abc;xyz;NAMEX;the diplomat'. Multiple occurrences of the same value in the same
 * document must be recorded as "attrs.offsets" = [n1,n2,n3...] offsets.
 *
 * @param rec_id  the record (document) ID
 * @param contrib the contributor
 * @param _type   the annotation type
 * @param val     the annotation value
 * @return the annotation ID, or null if the hash could not be computed
 */
public static String getAnnotationId(String rec_id, String contrib, String _type, String val) {
    try {
        return TextUtils.text_id(String.format("%s;%s;%s;%s", rec_id, contrib, _type, val));
    } catch (Exception err) {
        // This is an ODD and rare error to have with an MD5 or other digest; report it via the
        // class logger rather than stderr so it is not lost.
        logger.error("Failed to compute annotation ID for rec_id={}, contrib={}, type={}", rec_id, contrib,
                _type, err);
        return null;
    }
}
/**
 * Empties the internal annotation cache. Ideally, call this at the start of each new record
 * when processing records in a loop.
 */
public void reset() {
    this.deepEyeEntities.clear();
}
/**
 * Optimization: keep the list of annotations stored in the database to a minimum.
 * deepEyeEntities is a hash of Annotation organized by distinct name/value pairs; the cache
 * methods of this class build the key as "name;value" unless the caller supplies its own key.
 * The offsets of the entity values are the only thing that appears to change.
 */
private final Map<String, Annotation> deepEyeEntities = new HashMap<>();
/**
 * Gets the cached annotations, unordered.
 *
 * The returned collection is the cache map's own {@code values()} view, not a copy.
 *
 * @return the cached annotations, or null (not an empty collection) if the cache is empty
 */
public Collection<Annotation> getCachedAnnotations() {
    // The null return for an empty cache is existing behavior that callers may test for.
    return deepEyeEntities.isEmpty() ? null : deepEyeEntities.values();
}
/**
 * Caches an entity annotation, accumulating offsets for a type/value pair.
 *
 * The cache key is built from {@code etype} and {@code value} only. On the first sighting a
 * new annotation is created (with offset -1) and stored; in every case {@code start} is
 * added to the cached annotation's offsets.
 *
 * @param contrib the contributor
 * @param etype   the entity type
 * @param value   the entity value
 * @param start   the offset of this occurrence
 * @param docid   the document ID
 * @return the cached annotation for this type/value pair
 */
public Annotation cacheAnnotation(String contrib, String etype, String value, int start, String docid) {
    final String key = cacheKey(etype, value);
    final Annotation known = deepEyeEntities.get(key);
    if (known != null) {
        known.addOffset(start);
        return known;
    }
    final Annotation created = createAnnotation(contrib, etype, value, -1, docid);
    deepEyeEntities.put(key, created);
    created.addOffset(start);
    return created;
}
/**
 * Stores an annotation in the cache under a caller-chosen key, replacing any annotation
 * previously stored under that key.
 *
 * @param ea  the annotation to cache
 * @param key the cache key
 */
public void cacheAnnotation(Annotation ea, String key) {
    this.deepEyeEntities.put(key, ea);
}
/**
 * Stores an annotation in the cache, keyed by its name and value exactly as given (no case
 * normalization). Any annotation already cached under the same key is replaced.
 *
 * @param ea your annotation
 */
public void cacheAnnotation(Annotation ea) {
    final String key = cacheKey(ea.name, ea.value);
    deepEyeEntities.put(key, ea);
}
/**
 * Cache entity annotation - in memory. Note, the actual ID or key in a database is usually
 * composed of name+value+contrib; here the key is the name plus the lower-cased value.
 *
 * If an annotation already exists under that key, all that happens is that {@code start} is
 * added to the existing entry's offsets. Otherwise {@code ea} itself is stored.
 *
 * NOTE(review): on first insert {@code start} is not added to {@code ea}; presumably the
 * caller has already set the offset on {@code ea} -- confirm against callers.
 *
 * @param ea    your annotation; its value must not be null
 * @param start offset into your doc.
 * @throws NullPointerException if the annotation or its value is null
 */
public void cacheAnnotation(Annotation ea, int start) {
    final String key = cacheKey(ea.name, ea.value.toLowerCase());
    final Annotation existing = deepEyeEntities.get(key);
    if (existing == null) {
        deepEyeEntities.put(key, ea);
        return;
    }
    // Only additional metadata here is the offset.
    existing.addOffset(start);
}
/**
 * Checks whether a (non-null) annotation is cached for the given type/value pair.
 *
 * @param etype the entity type
 * @param value the value, compared as given (no case normalization)
 * @return true if a cached annotation exists for that pair
 */
public boolean hasCachedAnnotation(String etype, String value) {
    return deepEyeEntities.get(cacheKey(etype, value)) != null;
}
/**
 * Builds the cache key "n;v" that distinguishes one entity/value pair from another. The
 * caller decides whether LOC="Boston" and LOC="BOSTON" are the same entry, e.g., by passing
 * in the lower case value. A null argument appears as the text "null" in the key.
 *
 * @param n the annotation name or type
 * @param v the annotation value
 * @return the cache key
 */
private String cacheKey(String n, String v) {
    return n + ";" + v;
}
/**
 * Looks up the cached annotation for a type/value pair.
 *
 * Careful -- there is no guarantee against two entity annotations sharing the same
 * type/value unintentionally, since the contributor is not part of the key. E.g., if "tx"
 * means a taxon for one contributor and a transaction for another, then you the developer
 * should choose more distinct entity type codes.
 *
 * @param etype entity type
 * @param value value
 * @return the cached annotation, or null if there is none
 */
public Annotation getCachedAnnotation(String etype, String value) {
    return deepEyeEntities.get(cacheKey(etype, value));
}
/**
 * Caches a taxon entity annotation, keyed by the taxon name and the matched value.
 *
 * On the first sighting a "tx" annotation is created (with offset -1) and stored; in every
 * case {@code offset} is added to the cached annotation's offsets.
 *
 * @param contrib contributor ID
 * @param taxon   taxon obj
 * @param value   string value
 * @param offset  offset
 * @param docid   docid
 * @return the cached annotation
 */
protected Annotation cacheTaxonAnnotation(String contrib, Taxon taxon, String value, int offset, String docid) {
    final String key = cacheKey(taxon.name, value);
    final Annotation known = deepEyeEntities.get(key);
    if (known != null) {
        known.addOffset(offset);
        return known;
    }
    final Annotation created = createTaxonAnnotation(contrib, "tx", value, -1, docid, taxon);
    deepEyeEntities.put(key, created);
    created.addOffset(offset);
    return created;
}
/**
 * Creates a standard named entity annotation. The annotation ID is derived from docid,
 * contributor, type and value via {@link #getAnnotationId(String, String, String, String)}.
 *
 * @param contrib contributor ID
 * @param type    annotation type/ID
 * @param val     string value
 * @param offset  offset
 * @param docid   docid
 * @return the annotation
 */
public static Annotation createAnnotation(String contrib, String type, String val, int offset, String docid) {
    final String annotId = getAnnotationId(docid, contrib, type, val);
    final Annotation created = new Annotation(annotId, docid, contrib, type, val);
    created.offset = offset;
    created.value = val;
    return created;
}
/**
 * Creates a standard named entity annotation, same as
 * {@link #createAnnotation(String, String, String, int, String)}, and then passes
 * {@code len} to the annotation's {@code setLength}.
 *
 * @param contrib contributor ID
 * @param type    annotation type/ID
 * @param val     string value
 * @param offset  offset
 * @param len     length handed to {@code setLength}
 * @param docid   docid
 * @return the annotation
 */
public static Annotation createAnnotation(String contrib, String type, String val, int offset, int len,
        String docid) {
    final Annotation spanned = createAnnotation(contrib, type, val, offset, docid);
    spanned.setLength(len);
    return spanned;
}
/**
 * Creates an annotation for a Taxon node that has a found value, val, in document docid at
 * offset. The taxon match has a type and a contributor, usually the tagger or extractor that
 * processed the document. The taxon's name and catalog are stored as the attributes
 * {@code "name"} and {@code "cat"}.
 *
 * @param contrib the contributor
 * @param type    the annotation type
 * @param val     the matched value
 * @param offset  the offset
 * @param docid   the document ID
 * @param n       the taxon node; must not be null
 * @return the annotation
 */
public static Annotation createTaxonAnnotation(String contrib, String type, String val, int offset, String docid,
        Taxon n) {
    final Annotation taxonAnnot = new Annotation(getAnnotationId(docid, contrib, type, val), docid, contrib, type,
            val);
    taxonAnnot.newAttributes();
    final JsonObject attrs = taxonAnnot.attrs;
    attrs.put("name", n.name);
    attrs.put("cat", n.catalog);
    taxonAnnot.offset = offset;
    return taxonAnnot;
}
/**
 * Recreates a Taxon from a stored annotation. Fields read:
 *
 * a.attrs[name] -- taxon node name; a.attrs[cat] -- catalog. The annotation's own name and
 * value are not used here.
 *
 * @param a the annotation
 * @return the taxon, or null if the annotation has no attributes
 */
public static Taxon createTaxon(Annotation a) {
    if (a.attrs == null) {
        return null;
    }
    final Taxon taxon = new Taxon();
    taxon.name = a.attrs.getString("name");
    taxon.catalog = a.attrs.getString("cat");
    return taxon;
}
/**
 * Tracks a country name match of some sort. You know this is a country, so please enrich it
 * with the country code here. You can always find the country code later from a given country
 * name/match, however this may be context specific.
 *
 * Georgia / GE -- putting the country code here gives more confidence that you found Georgia
 * the country and not the US state. You might have other means for deriving the country code
 * for a given value; e.g., you found "GOI", a geopolitical entity you infer to be Govt. of
 * India, so you emit "IN" as the country code:
 *
 * create( xxx, 'GPE', 'GOI', ..., 'IN' )
 *
 * The code is stored as attribute {@code "cc"}; with a null code no attributes are created.
 *
 * @param contrib      the contributor
 * @param type         the annotation type
 * @param val          the matched value
 * @param offset       the offset
 * @param docid        the document ID
 * @param country_code the country code, may be null
 * @return the annotation
 */
public static Annotation createCountryAnnotation(String contrib, String type, String val, int offset, String docid,
        String country_code) {
    final Annotation countryAnnot = createAnnotation(contrib, type, val, offset, docid);
    if (country_code == null) {
        return countryAnnot;
    }
    countryAnnot.newAttributes();
    countryAnnot.attrs.put("cc", country_code);
    return countryAnnot;
}
/**
 * Returns an instance of a Country object using the annotation value as country name, and
 * attr[cc] optionally as code. This does not reproduce a full Country object as if queried
 * from the Geonames utility.
 *
 * @see org.opensextant.util.GeonamesUtility#getCountry(String)
 *
 * @param a the annotation
 * @return the country, or null if the annotation has no attributes
 */
public static Country createCountry(Annotation a) {
    if (a.attrs == null) {
        return null;
    }
    return new Country(a.attrs.getString("cc"), a.value);
}
/**
 * Encodes a geocoding as an annotation to be saved. This schema follows from EH/GLINT/Glare,
 * etc. Attributes written, each only when the source value is available:
 *
 * cc; adm1; feat_class and feat_code (both gated on the feature class); prec, lat and lon
 * (only for a valid non-zero coordinate); method; adm1_name (the admin name, else the
 * ADM1 name).
 *
 * @param contrib the contributor
 * @param type    the annotation type
 * @param val     the matched value
 * @param offset  the offset
 * @param docid   the document ID
 * @param g       the geocoding; must not be null
 * @return the annotation
 */
public static Annotation createGeocodingAnnotation(String contrib, String type, String val, int offset,
        String docid, Geocoding g) {
    final Annotation geoAnnot = createAnnotation(contrib, type, val, offset, docid);
    geoAnnot.newAttributes();
    final JsonObject attrs = geoAnnot.attrs;

    if (null != g.getCountryCode()) {
        attrs.put("cc", g.getCountryCode());
    }
    if (null != g.getAdmin1()) {
        attrs.put("adm1", g.getAdmin1());
    }
    if (null != g.getFeatureClass()) {
        attrs.put("feat_class", g.getFeatureClass());
        attrs.put("feat_code", g.getFeatureCode());
    }

    final boolean hasLocation = isValidNonZeroCoordinate(g.getLatitude(), g.getLongitude());
    if (hasLocation) {
        attrs.put("prec", g.getPrecision());
        attrs.put("lat", g.getLatitude());
        attrs.put("lon", g.getLongitude());
    }

    if (null != g.getMethod()) {
        attrs.put("method", g.getMethod());
    }

    // The general admin name wins over the ADM1 name; both land under the same key.
    if (null != g.getAdminName()) {
        attrs.put("adm1_name", g.getAdminName());
        return geoAnnot;
    }
    if (null != g.getAdmin1Name()) {
        attrs.put("adm1_name", g.getAdmin1Name());
    }
    return geoAnnot;
}
/**
 * Converts a loosely-typed attribute value to a double.
 *
 * Accepts any {@link Number} (Double, Integer, Long, ...) as well as a String holding a
 * decimal number, so a coordinate stored as a whole number is handled like any other.
 *
 * @param o the value to convert
 * @return the numeric value as a double
 * @throws NumberFormatException if {@code o} is null, is an unparseable String, or is of any
 *                               other type
 */
private static double parseDouble(Object o) {
    if (o instanceof Number) {
        return ((Number) o).doubleValue();
    }
    if (o instanceof String) {
        return Double.parseDouble((String) o);
    }
    throw new NumberFormatException("Unknown object: " + o);
}
/**
 * Decode: Geocoding. See the OpenSextant Geocoding interface. Annotation attributes read:
 *
 * lat, lon, prec (or precision); cc, adm1, adm2; feat_class, feat_code; method;
 * adm1_name, adm2_name.
 *
 * Coordinates are only set when both lat and lon are present and form a valid non-zero
 * coordinate. The admin name is the finest grained name available: ADM2 name first, then
 * ADM1 name.
 *
 * @param a the annotation
 * @return the geocoded place, or null if the annotation has no attributes
 * @throws NumberFormatException if lat or lon cannot be converted to a number
 */
public static Place createGeocoding(Annotation a) {
    if (a.attrs == null) {
        return null;
    }
    Place geo = new Place(a.id, a.value);

    // Require BOTH values: a "lat" without a usable "lon" used to fail inside parseDouble.
    Object olat = a.attrs.containsKey("lat") ? a.attrs.getValue("lat") : null;
    Object olon = a.attrs.containsKey("lon") ? a.attrs.getValue("lon") : null;
    if (olat != null && olon != null) {
        double lat = parseDouble(olat);
        double lon = parseDouble(olon);
        if (isValidNonZeroCoordinate(lat, lon)) {
            geo.setLatitude(lat);
            geo.setLongitude(lon);
            // Precision may be stored under either key; "prec" is preferred.
            int prec = a.attrs.getInteger("prec", -1);
            if (prec < 0) {
                prec = a.attrs.getInteger("precision", -1);
            }
            geo.setPrecision(prec);
            if (geo.getPrecision() <= 0) {
                logger.info("Location should have precision: {}", a.attrs);
            }
        } else {
            logger.info("GEOLOCATION warning: 0,0: ID={}, {}", a.rec_id, a.attrs);
        }
    }

    geo.setMethod(a.attrs.getString("method", null));
    geo.setAdmin1(a.attrs.getString("adm1", null));
    geo.setAdmin2(a.attrs.getString("adm2", null));
    geo.setCountryCode(a.attrs.getString("cc", null));
    geo.setFeatureClass(a.attrs.getString("feat_class", null));
    geo.setFeatureCode(a.attrs.getString("feat_code", null));

    // Optional items:
    geo.setAdmin1Name(a.attrs.getString("adm1_name", null));
    geo.setAdmin2Name(a.attrs.getString("adm2_name", null));

    /*
     * Choose finest grained admin name.
     * First ADM2 name then ADM1 name...
     */
    if (geo.getAdmin2Name() != null) {
        geo.setAdminName(geo.getAdmin2Name());
    } else if (geo.getAdmin1Name() != null) {
        geo.setAdminName(geo.getAdmin1Name());
    }
    return geo;
}
/** Formatter for the normalized date attribute ("datenorm"), pattern yyyy-MM-dd. */
private static final DateTimeFormatter DATEFMT = DateTimeFormat.forPattern("yyyy-MM-dd");
/**
 * Creates the temporal entity annotation. Attributes written:
 *
 * datenorm -- the date formatted as yyyy-MM-dd; epoch -- the date in milliseconds;
 * res -- finest resolution of the date: Y, M, D, H, m, s; Week (W).
 *
 * @param contrib    the contributor
 * @param type       the annotation type
 * @param val        the matched value
 * @param offset     the offset
 * @param docid      the document ID
 * @param d          the date; must not be null
 * @param resolution the resolution code
 * @return the annotation
 */
public static Annotation createTemporalEntityAnnotation(String contrib, String type, String val, int offset,
        String docid, Date d, String resolution) {
    final Annotation temporal = createAnnotation(contrib, type, val, offset, docid);
    temporal.newAttributes();
    final long epochMillis = d.getTime();
    final JsonObject attrs = temporal.attrs;
    attrs.put("datenorm", DATEFMT.print(epochMillis)); // formatted date/time.
    attrs.put("epoch", epochMillis); // Milliseconds
    attrs.put("res", resolution);
    return temporal;
}
/**
 * Same as
 * {@link #createTemporalEntityAnnotation(String, String, String, int, String, Date, String)},
 * just with a len param, which is passed to the annotation's {@code setLength}.
 *
 * @param contrib    the contributor
 * @param type       the annotation type
 * @param val        the matched value
 * @param offset     the offset
 * @param len        length handed to {@code setLength}
 * @param docid      the document ID
 * @param d          the date; must not be null
 * @param resolution the resolution code
 * @return the annotation
 */
public static Annotation createTemporalAnnotation(String contrib, String type, String val, int offset, int len,
        String docid, Date d, String resolution) {
    final Annotation temporal = createTemporalEntityAnnotation(contrib, type, val, offset, docid, d, resolution);
    temporal.setLength(len);
    return temporal;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy