
com.sindicetech.siren.analysis.AbstractJsonTokenizer Maven / Gradle / Ivy
The newest version!
/**
* Copyright (c) 2014, Sindice Limited. All Rights Reserved.
*
* This file is part of the SIREn project.
*
* SIREn is a free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* SIREn is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public
* License along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.sindicetech.siren.analysis;
import com.sindicetech.siren.analysis.attributes.DatatypeAttribute;
import com.sindicetech.siren.analysis.attributes.JsonNodeAttributeImpl;
import com.sindicetech.siren.analysis.attributes.NodeAttribute;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
import java.io.IOException;
import java.io.Reader;
/**
* Abstraction over the tokenizers for JSON documents. The tokenizer maps the JSON model to SIREn's tree model.
*
* The tokenizer returns a token per node in the tree. It attaches to each token a node identifier using the
* {@link NodeAttribute} and a datatype using the {@link DatatypeAttribute}.
*
* It is mandatory for the tokenizer to return tokens following the natural order of the node identifiers.
*
* Regarding datatypes, the convention is the following:
* <ul>
* <li> If a field name is parsed, the datatype
* {@link com.sindicetech.siren.util.JSONDatatype#JSON_FIELD} is assigned;</li>
* <li> If a string value is parsed, the datatype
* {@link com.sindicetech.siren.util.XSDDatatype#XSD_STRING} is assigned;</li>
* <li> If a boolean value is parsed, the datatype
* {@link com.sindicetech.siren.util.XSDDatatype#XSD_BOOLEAN} is assigned;</li>
* <li> If a numerical value is parsed, the datatype
* {@link com.sindicetech.siren.util.XSDDatatype#XSD_LONG} is assigned;</li>
* <li> If a numerical value with a fraction is parsed, the datatype
* {@link com.sindicetech.siren.util.XSDDatatype#XSD_DOUBLE} is assigned.</li>
* </ul>
*/
public abstract class AbstractJsonTokenizer extends Tokenizer {

  /// Token Definition

  /** Index into {@link #TOKEN_TYPES} of the type label for null tokens. */
  public static final int NULL = 0;

  /** Index into {@link #TOKEN_TYPES} of the type label for true tokens. */
  public static final int TRUE = 1;

  /** Index into {@link #TOKEN_TYPES} of the type label for false tokens. */
  public static final int FALSE = 2;

  /** Index into {@link #TOKEN_TYPES} of the type label for number tokens. */
  public static final int NUMBER = 3;

  /** Index into {@link #TOKEN_TYPES} of the type label for literal tokens. */
  public static final int LITERAL = 4;

  /**
   * Datatype JSON schema: field for the datatype label
   */
  public static final String DATATYPE_LABEL = "_datatype_";

  /**
   * Datatype JSON schema: field for the datatype value
   */
  public static final String DATATYPE_VALUES = "_value_";

  /**
   * Token type labels, indexed by {@link #NULL}, {@link #TRUE}, {@link #FALSE},
   * {@link #NUMBER} and {@link #LITERAL}. Populated by {@link #getTokenTypes()}.
   */
  protected static String[] TOKEN_TYPES = getTokenTypes();

  /**
   * Creates a tokenizer over the given reader and registers its attributes.
   *
   * @param input the reader handed to the {@link Tokenizer} superclass
   */
  public AbstractJsonTokenizer(final Reader input) {
    super(input);
    this.initAttributes();
  }

  /**
   * Creates a tokenizer over the given reader, using the given attribute
   * factory, and registers its attributes.
   *
   * @param factory the attribute factory handed to the {@link Tokenizer} superclass
   * @param input the reader handed to the {@link Tokenizer} superclass
   */
  public AbstractJsonTokenizer(final AttributeFactory factory, final Reader input) {
    super(factory, input);
    this.initAttributes();
  }

  /**
   * Returns the table of token type labels, creating it if it does not exist yet.
   *
   * <p>Every token kind gets its own label ({@code <NULL>}, {@code <TRUE>},
   * {@code <FALSE>}, {@code <NUMBER>}, {@code <LITERAL>}) so that consumers of the
   * type can tell the kinds apart. The angle-bracket form follows the naming style
   * of Lucene's built-in token types.
   *
   * @return the shared label array (not a copy), indexed by the token constants
   */
  public static String[] getTokenTypes() {
    if (TOKEN_TYPES == null) {
      // Fill a local array first so the static field never exposes a
      // partially populated table.
      final String[] types = new String[5];
      types[NULL] = "<NULL>";
      types[TRUE] = "<TRUE>";
      types[FALSE] = "<FALSE>";
      types[NUMBER] = "<NUMBER>";
      types[LITERAL] = "<LITERAL>";
      TOKEN_TYPES = types;
    }
    return TOKEN_TYPES;
  }

  // A JSON tokenizer exposes at least 6 attributes:
  // term, offset, positionIncrement, type, datatype, node
  protected CharTermAttribute termAtt;
  protected OffsetAttribute offsetAtt;
  protected PositionIncrementAttribute posIncrAtt;
  protected TypeAttribute typeAtt;
  protected DatatypeAttribute dtypeAtt;
  protected NodeAttribute nodeAtt;

  /**
   * Registers the six attributes on this tokenizer and caches them in the
   * corresponding fields. Called from both constructors.
   */
  private void initAttributes() {
    termAtt = this.addAttribute(CharTermAttribute.class);
    offsetAtt = this.addAttribute(OffsetAttribute.class);
    posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
    typeAtt = this.addAttribute(TypeAttribute.class);
    dtypeAtt = this.addAttribute(DatatypeAttribute.class);
    // If no NodeAttribute is registered yet, install the JSON-specific
    // implementation before asking for the attribute - presumably so that
    // nodeAtt is backed by JsonNodeAttributeImpl rather than by whatever the
    // attribute factory would create by default.
    if (!this.hasAttribute(NodeAttribute.class)) {
      this.addAttributeImpl(new JsonNodeAttributeImpl());
    }
    nodeAtt = this.addAttribute(NodeAttribute.class);
  }

  /**
   * Clears all attributes, sets the position increment to 1, and delegates to
   * {@link #nextToken()} to produce the next token.
   *
   * @return the value returned by {@link #nextToken()}
   * @throws IOException if {@link #nextToken()} throws it
   */
  @Override
  public final boolean incrementToken() throws IOException {
    this.clearAttributes();
    posIncrAtt.setPositionIncrement(1);
    return this.nextToken();
  }

  /**
   * Advances to the next token and updates the attributes.
   *
   * @return the value to be returned by {@link #incrementToken()}
   * @throws IOException if reading the input fails
   */
  protected abstract boolean nextToken() throws IOException;

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy