
org.terrier.indexing.FlatJSONDocument Maven / Gradle / Ivy
The newest version!
package org.terrier.indexing;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonObject;
/**
* This is a Terrier Document implementation of a document stored in JSON format. It assumes that
* a single JSON document has at least a single attribute called 'text' that contains the text of
* the document.
*
* Fields:
* This implementation supports a single field named 'TEXT' by default. The property
* FlatJSONDocument.process (falling back to FieldTags.process, and then to 'text') is a
* comma-delimited list of properties to use as fields.
*
* Meta-Data:
* During the parsing process, the properties of each FlatJSONDocument are decorated with document meta-data.
* This decoration process is performed by 'flattening' the layered structure of the JSON object and its
* sub-attributes into individual properties. For property naming, attributes in different layers are connected
* with a dot '.', e.g. the 'name' attribute nested under 'user' becomes the property user.name.
*
* @author Richard McCreadie and Saul Vargas
* @since 5.1
*/
public class FlatJSONDocument implements Document {
// constructor properties
protected Map properties;
protected Tokeniser tokenizer;
public String[][] tokens;
protected List fieldQueue;
Map> fieldSet;
// static properties
protected String[] fieldsToProcess = ArrayUtils.parseCommaDelimitedString(
ApplicationSetup.getProperty("FlatJSONDocument.process",
ApplicationSetup.getProperty("FieldTags.process", "text"))
);
// state
protected int fieldIndex = 0;
protected int tokenIndex = -1;
protected int remainingTokens;
/**
 * Creates a document from an already-parsed Gson object. The object is converted back to its
 * string form and processed exactly as if that string had been supplied directly.
 *
 * @param json the JSON object describing the document
 */
public FlatJSONDocument(JsonObject json) {
    this(json.toString());
}
public FlatJSONDocument(String rawJson) {
initalize(rawJson);
}
@SuppressWarnings("unchecked")
protected void initalize(String rawJson) {
try {
this.tokenizer = Tokeniser.getTokeniser();
ObjectMapper mapper = new ObjectMapper();
Map
© 2015 - 2025 Weber Informatics LLC | Privacy Policy