All Downloads are FREE. The search and download functionality uses the official Maven repository.

com.sindicetech.siren.analysis.ExtendedJsonTokenizer Maven / Gradle / Ivy

The newest version!
/**
 * Copyright (c) 2014, Sindice Limited. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package com.sindicetech.siren.analysis;

import org.apache.lucene.util.AttributeFactory;

import java.io.IOException;
import java.io.Reader;

/**
 * A tokenizer for JSON documents.
 * 

* The tokenizer parses JSON data and generates a token for each field name, object and value. * * @see com.sindicetech.siren.analysis.AbstractJsonTokenizer * @see ExtendedJsonScanner */ public class ExtendedJsonTokenizer extends AbstractJsonTokenizer { private final ExtendedJsonScanner scanner; public ExtendedJsonTokenizer(final Reader input) { super(input); scanner = new ExtendedJsonScanner(input); } public ExtendedJsonTokenizer(final AttributeFactory factory, final Reader input) { super(factory, input); scanner = new ExtendedJsonScanner(input); } @Override protected boolean nextToken() throws IOException { final int tokenType = scanner.getNextToken(); switch (tokenType) { case FALSE: termAtt.append("false"); this.updateToken(tokenType, scanner.yychar()); break; case TRUE: termAtt.append("true"); this.updateToken(tokenType, scanner.yychar()); break; case NULL: termAtt.append("null"); this.updateToken(tokenType, scanner.yychar()); break; case NUMBER: scanner.getLiteralText(termAtt); this.updateToken(tokenType, scanner.yychar()); break; case LITERAL: scanner.getLiteralText(termAtt); this.updateToken(tokenType, scanner.yychar() + 1); break; default: return false; } return true; } /** * Update type, datatype, offset, tuple id and cell id of the token * * @param tokenType The type of the generated token * @param startOffset The starting offset of the token */ private void updateToken(final int tokenType, final int startOffset) { // Update offset offsetAtt.setOffset(this.correctOffset(startOffset), this.correctOffset(startOffset + termAtt.length())); // update token type typeAtt.setType(TOKEN_TYPES[tokenType]); // update datatype dtypeAtt.setDatatypeURI(scanner.getDatatypeURI()); // Update structural information nodeAtt.copyNode(scanner.getNodePath()); } @Override public void reset() throws IOException { super.reset(); if (input.markSupported()) { input.reset(); } scanner.yyreset(input); } @Override public void close() throws IOException { super.close(); 
scanner.yyclose(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy