All downloads are free. The search and download functionality uses the official Maven repository.

org.coode.owlapi.manchesterowlsyntax.ManchesterOWLSyntaxTokenizer Maven / Gradle / Ivy

/*
 * This file is part of the OWL API.
 *
 * The contents of this file are subject to the LGPL License, Version 3.0.
 *
 * Copyright (C) 2011, The University of Manchester
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/.
 *
 *
 * Alternatively, the contents of this file may be used under the terms of the Apache License, Version 2.0
 * in which case, the provisions of the Apache License Version 2.0 are applicable instead of those above.
 *
 * Copyright 2011, University of Manchester
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.coode.owlapi.manchesterowlsyntax;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;


/**
 * Splits a buffer of Manchester OWL Syntax text into a flat list of {@link Token}s.
 *
 * <p>Tokenizing rules, as implemented by {@link #tokenize()}:
 * <ul>
 * <li>space, tab, carriage return and line feed separate tokens and are dropped;</li>
 * <li>{@code #} and {@code *} start a comment that runs to the end of the line and is dropped;</li>
 * <li>a double- or single-quoted string becomes one token, quotes included;</li>
 * <li>{@code <} followed by non-whitespace characters up to {@code >} becomes one token
 *     (a candidate IRI); otherwise {@code <} is emitted as a token of its own;</li>
 * <li>every other delimiter character is emitted as a token of its own, except {@code @},
 *     which stays attached to the text that follows it;</li>
 * <li>the list always ends with a token whose text is {@link #EOF}.</li>
 * </ul>
 *
 * <p>Instances keep mutable scanning state and are not written for concurrent use.
 *
 * <p>Author: Matthew Horridge<br>
 * The University Of Manchester<br>
 * Bio-Health Informatics Group<br>
 * Date: 19-May-2008
 */
@SuppressWarnings("javadoc")
public class ManchesterOWLSyntaxTokenizer {

    /** Text of the sentinel token appended at the end of every token list. */
    public static final String EOF = "|EOF|";

    private static final char ESCAPE_CHAR = '\\';

    /** Characters that separate tokens and are discarded. */
    protected Set<Character> skip = new HashSet<Character>();

    /** Characters that start a comment running to the end of the line. */
    protected Set<Character> commentDelimiters = new HashSet<Character>();

    /** Characters that end the current token and are emitted as tokens themselves. */
    protected Set<Character> delims = new HashSet<Character>();

    private String buffer;

    /** Offset of the next character to read from {@link #buffer}. */
    private int pos;

    /** Column counter, maintained by {@link #readChar()}. */
    private int col;

    /** Row counter, maintained by {@link #readChar()}. */
    private int row;

    /** Offset, column and row recorded for the token currently being collected. */
    int startPos = 0;
    int startCol = 1;
    int startRow = 1;

    List<Token> tokens = new ArrayList<Token>();

    /** Text of the token currently being collected. */
    private StringBuilder sb;

    /**
     * Creates a tokenizer over the given text.
     *
     * @param buffer the text to tokenize
     */
    public ManchesterOWLSyntaxTokenizer(String buffer) {
        this.buffer = buffer;
        skip.add(' ');
        skip.add('\n');
        skip.add('\r');
        skip.add('\t');
        commentDelimiters.add('#');
        commentDelimiters.add('*');
        delims.add('(');
        delims.add(')');
        delims.add('[');
        delims.add(']');
        delims.add(',');
        delims.add('{');
        delims.add('}');
        delims.add('^');
        delims.add('@');
        delims.add('<');
        delims.add('>');
        delims.add('=');
        delims.add('?');
    }

    /** Puts all scanning state back to the start of the buffer. */
    private void reset() {
        sb = new StringBuilder();
        tokens.clear();
        startRow = 1;
        startCol = 1;
        startPos = 0;
        pos = 0;
        row = 1;
        col = 1;
    }

    /**
     * Tokenizes the whole buffer from the beginning. May be called repeatedly; each call
     * rescans the buffer and returns a fresh list.
     *
     * @return a new list holding every token found, followed by an {@link #EOF} token
     */
    public List<Token> tokenize() {
        reset();
        int bufferLen = buffer.length();
        char lastChar = ' ';
        while (pos < bufferLen) {
            char ch = readChar();
            if (ch == ESCAPE_CHAR) {
                if (pos >= bufferLen) {
                    // A backslash that is the very last character has nothing to escape.
                    // Keep it as literal text instead of reading past the end of the buffer.
                    sb.append(ch);
                    break;
                }
                lastChar = ch;
                ch = readChar();
            }
            if (ch == '\"' && lastChar != '\\') {
                readString('\"', true);
            } else if (ch == '\'' && lastChar != '\\') {
                readString('\'', true);
            } else if (ch == '<') {
                // Potentially the start of an IRI
                readIRI();
            } else if (skip.contains(ch)) {
                consumeToken();
            } else if (commentDelimiters.contains(ch)) {
                consumeToken();
                readComment();
            } else if (delims.contains(ch)) {
                consumeToken();
                sb.append(ch);
                // '@' is not closed off here, so it stays glued to the text that follows
                if (ch != '@') {
                    consumeToken();
                }
            } else {
                sb.append(ch);
            }
            lastChar = ch;
        }
        consumeToken();
        tokens.add(new Token(EOF, pos, col, row));
        return new ArrayList<Token>(tokens);
    }

    /**
     * Emits the collected text (if any) as a token and marks the current read position as
     * the start of the next token.
     */
    private void consumeToken() {
        if (sb.length() > 0) {
            tokens.add(new Token(sb.toString(), startPos, startCol, startRow));
            sb = new StringBuilder();
        }
        startPos = pos;
        startCol = col;
        startRow = row;
    }

    /** Skips characters up to and including the next line feed, or to the end of the buffer. */
    private void readComment() {
        char ch = '#';
        while (ch != '\n' && pos < buffer.length()) {
            ch = readChar();
        }
        consumeToken();
    }

    /**
     * Reads a quoted string whose opening quote has already been consumed and emits it as
     * one token. Inside the string, a backslash followed by a quote or another backslash
     * yields just that character; any other backslash pair is kept verbatim.
     *
     * @param terminator the quote character that ends the string
     * @param appendTerminator whether the opening and closing quotes are part of the token text
     */
    private void readString(char terminator, boolean appendTerminator) {
        if (appendTerminator) {
            sb.append(terminator);
        }
        while (pos < buffer.length()) {
            char ch = readChar();
            if (ch == ESCAPE_CHAR) {
                // pos already points at the character after the backslash, so this is the
                // correct "is there anything left to escape" check.
                if (pos < buffer.length()) {
                    char escapedChar = readChar();
                    if (escapedChar == '\"' || escapedChar == '\'' || escapedChar == '\\') {
                        sb.append(escapedChar);
                    } else {
                        sb.append(ch);
                        sb.append(escapedChar);
                    }
                } else {
                    sb.append('\\');
                }
            } else if (ch == terminator) {
                if (appendTerminator) {
                    sb.append(ch);
                }
                break;
            } else {
                sb.append(ch);
            }
        }
        consumeToken();
    }

    /**
     * Handles a {@code <} that has just been consumed. If a {@code >} is reached before any
     * whitespace, everything from {@code <} to {@code >} is emitted as one token. If
     * whitespace comes first, only {@code <} is emitted and scanning resumes right after it.
     */
    private void readIRI() {
        if (sb.length() > 0) {
            // Text directly in front of '<' (e.g. "a<b>") must not be thrown away: emit it,
            // then start the next token at the '<' that was just read.
            tokens.add(new Token(sb.toString(), startPos, startCol, startRow));
            startPos = pos - 1;
            startCol = col - 1;
            startRow = row;
        }
        sb = new StringBuilder("<");
        int savedPos = pos;
        int savedCol = col;
        int savedRow = row;
        while (pos < buffer.length()) {
            char ch = readChar();
            if (Character.isWhitespace(ch)) {
                // Not an IRI -- go back to where we started. Column and row are rewound
                // together with the offset so later tokens keep correct coordinates.
                pos = savedPos;
                col = savedCol;
                row = savedRow;
                sb = new StringBuilder("<");
                consumeToken();
                break;
            } else if (ch == '>') {
                // End of IRI
                sb.append(">");
                consumeToken();
                break;
            } else {
                sb.append(ch);
            }
        }
    }

    /**
     * Reads the next character and advances the offset, column and row counters.
     *
     * @return the character that was at the current offset
     */
    private char readChar() {
        char ch = buffer.charAt(pos);
        pos++;
        col++;
        if (ch == '\n') {
            row++;
            // Columns start at 1 on every row, matching the initial state set by reset().
            col = 1;
        }
        return ch;
    }

    /** A piece of token text together with the offset, column and row recorded for it. */
    public static class Token {

        private final String token;
        private final int pos;
        private final int col;
        private final int row;

        /**
         * @param token the token text
         * @param pos the buffer offset recorded for the token
         * @param col the column recorded for the token
         * @param row the row recorded for the token
         */
        public Token(String token, int pos, int col, int row) {
            this.token = token;
            this.pos = pos;
            this.col = col;
            this.row = row;
        }

        /** @return the token text */
        public String getToken() {
            return token;
        }

        /** @return the buffer offset recorded for the token */
        public int getPos() {
            return pos;
        }

        /** @return the column recorded for the token */
        public int getCol() {
            return col;
        }

        /** @return the row recorded for the token */
        public int getRow() {
            return row;
        }

        /** @return the token text followed by {@code [pos, col, row]} */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append(token);
            sb.append(" [");
            sb.append(pos);
            sb.append(", ");
            sb.append(col);
            sb.append(", ");
            sb.append(row);
            sb.append("]");
            return sb.toString();
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy