
// gate.corpora.json.JSONDocument Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of format-json Show documentation
// Show all versions of format-json Show documentation
// Document Format plugin to support reading and writing general JSON files
// The newest version!
/*
* Copyright (c) 1995-2018, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 3, June 2007 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*/
package gate.corpora.json;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import gate.Factory;
import gate.FeatureMap;
import gate.corpora.RepositioningInfo;
public class JSONDocument {
private String string;
private Set annotations;
public Set getAnnotations() {
return this.annotations;
}
public int getLength() {
return this.string.length();
}
public String getString() {
return this.string;
}
/**
* Used by the JSONTWeetFormat; the DocumentContent contains only the main text;
* the annotation feature map contains all the other JSON data, recursively.
*/
protected JSONDocument(JsonNode json, String textPath, boolean handleEntities) {
string = "";
annotations = new HashSet();
String[] path = textPath.split("\\.");
JsonNode current = json;
for (int i = 0 ; i < path.length - 1 ; ++i) {
current = json.get(path[i]);
}
RepositioningInfo repos = new RepositioningInfo();
string = unescape(((ObjectNode)current).remove(path[path.length-1]).asText(), repos);
if(handleEntities) processEntities(current, 0L, repos);
Iterator keys = json.fieldNames();
FeatureMap features = Factory.newFeatureMap();
while (keys.hasNext()) {
String key = keys.next();
features.put(key, JSONUtils.process(json.get(key)));
}
annotations.add(new PreAnnotation(0L, string.length(), JSONUtils.ANNOTATION_TYPE, features));
}
/**
* Characters to account for in unescaping - HTML-encoded ampersand and angle
* brackets, and supplementary characters (which don't need "unescaping" but do
* need to be accounted for in the repos info).
*/
private static Pattern UNESCAPE_PATTERN = Pattern.compile("&(?:amp|lt|gt);|[\\x{" +
Integer.toHexString(Character.MIN_SUPPLEMENTARY_CODE_POINT)+ "}-\\x{" +
Integer.toHexString(Character.MAX_CODE_POINT) + "}]");
/**
* Un-escape &, > and < in the given string, populating
* the supplied {@link RepositioningInfo} to describe the offset changes. Also
* record the position of any Unicode supplementary characters, as Twitter's
* entities format counts in characters (so a supplementary is 1) whereas GATE
* annotations count in Java char
values (UTF-16 code units, so
* a supplementary counts as two).
* @param str string, possibly including escaped ampersands or angle brackets
* @param repos {@link RepositioningInfo} to hold offset changes
* @return the unescaped string
*/
private String unescape(String str, RepositioningInfo repos) {
StringBuffer buf = new StringBuffer();
int origOffset = 0;
int extractedOffset = 0;
Matcher mat = UNESCAPE_PATTERN.matcher(str);
while(mat.find()) {
if(mat.start() != origOffset) {
// repositioning record for the span from end of previous match to start of this one
int nonMatchLen = mat.start() - origOffset;
repos.addPositionInfo(origOffset, nonMatchLen, extractedOffset, nonMatchLen);
origOffset += nonMatchLen;
extractedOffset += nonMatchLen;
}
// in most cases the original length is the number of code units the pattern matched
int origLen = mat.end() - mat.start();
// and the extracted result is one code unit
int extractedLen = 1;
String replace = "?";
switch(mat.group()) {
case "&": replace = "&"; break;
case ">": replace = ">"; break;
case "<": replace = "<"; break;
default:
// but in the case of supplementary characters, the original length
// (in *characters*) is 1 but the extracted length (in code units) is 2
replace = mat.group();
origLen = 1;
extractedLen = 2;
}
mat.appendReplacement(buf, replace);
// repositioning record covering this match
repos.addPositionInfo(origOffset, origLen, extractedOffset, extractedLen);
origOffset += origLen;
extractedOffset += extractedLen;
}
int tailLen = str.length() - origOffset;
if(tailLen > 0) {
// repositioning record covering everything after the last match
repos.addPositionInfo(origOffset, tailLen + 1, extractedOffset, tailLen + 1);
}
mat.appendTail(buf);
return buf.toString();
}
/**
* Process the "entities" property of this json object into annotations,
* shifting their offsets by the specified amount.
*
* @param json the Tweet json object
* @param startOffset offset correction if the text is not the first of
* the content keys.
*/
private void processEntities(JsonNode json, long startOffset, RepositioningInfo repos) {
JsonNode entitiesNode = ((ObjectNode)json).remove(JSONUtils.ENTITIES_ATTRIBUTE);
if(entitiesNode == null || !entitiesNode.isObject()) {
// no entities, nothing to do
return;
}
Iterator entityTypes = entitiesNode.fieldNames();
while(entityTypes.hasNext()) {
String entityType = entityTypes.next();
JsonNode entitiesOfType = entitiesNode.get(entityType);
if(entitiesOfType != null && entitiesOfType.isArray() && entitiesOfType.size() > 0) {
// if the entityType is X:Y then assume X is the AS name and Y is the actual type
String[] setAndType = entityType.split(":", 2);
Iterator it = entitiesOfType.elements();
while(it.hasNext()) {
JsonNode entity = it.next();
if(entity.isObject()) {
// process is guaranteed to return a FeatureMap given an object
FeatureMap features = (FeatureMap)JSONUtils.process(entity);
Object indices = features.get("indices");
if(indices != null && indices instanceof List>) {
List> indicesList = (List>)indices;
if(indicesList.get(0) instanceof Number && indicesList.get(1) instanceof Number) {
// finally we know we have a valid entity
features.remove("indices");
long annStart = repos.getExtractedPos(startOffset + ((Number)indicesList.get(0)).longValue());
long annEnd = repos.getExtractedPos(startOffset + ((Number)indicesList.get(1)).longValue());
if(setAndType.length == 2) {
// explicit annotation set name
annotations.add(new PreAnnotation(annStart, annEnd, setAndType[0], setAndType[1], features));
} else {
annotations.add(new PreAnnotation(annStart, annEnd, entityType, features));
}
}
}
}
}
}
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy