/*
 * Source listing of org.semanticweb.yars.nx.parser.NxParser from the
 * nxparser-parsers artifact (available for Maven / Gradle / Ivy).
 * Note: there is a newer version of this artifact: 4.0.0.
 */
package org.semanticweb.yars.nx.parser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.logging.Logger;

import org.semanticweb.yars.nx.BNode;
import org.semanticweb.yars.nx.Literal;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.Resource;
import org.semanticweb.yars.nx.Unbound;
import org.semanticweb.yars.nx.Variable;

/**
 * NxParser is a non-validating N1, N2, N3, N4, Nx parser.
 * <p>
 * Assumes the data to be formatted according to the N-Triples spec, including
 * it to be encoded in ASCII with Unicode characters properly escaped as in the
 * N-Triples spec.
 * <p>
 * Deviates from the N-Triples spec in two points known so far:
 * <ul>
 * <li>Only space delimits parts of a statement, not space or tab.</li>
 * <li>Previous to the full stop at the end of a statement, there must be a
 * space.</li>
 * </ul>
 * <p>
 * Usage: call one of the {@code parse(...)} methods, then iterate. The parser
 * always reads one statement ahead, so {@code parse(...)} already consumes
 * input up to and including the first parseable statement. Lines that are
 * blank, comments, or unparseable are skipped (unparseable ones are logged as
 * warnings).
 * 
 * @see <a href="http://www.w3.org/TR/rdf-testcases/#ntriples">N-Triples</a>
 * @version 1.2
 * 
 * @author Aidan Hogan
 * @author Andreas Harth
 * @author Tobias Kaefer
 */
public class NxParser implements Iterator<Node[]>, Iterable<Node[]> {

	private static final Logger _log = Logger.getLogger(NxParser.class.getName());

	/** Number of lines pulled from the line source so far; never reset by parse(). */
	private int _lineNo = 0;
	/** The line most recently pulled from the line source. */
	private String _line = null;
	/** Source of raw lines; null until one of the parse() methods is called. */
	private Iterator<String> _stringIt = null;
	/** Look-ahead: the statement the next call to next() returns, or null if none. */
	private Node[] next = null;

	/**
	 * Parses statements from a reader (wrapped in a {@link BufferedReader}).
	 * 
	 * @param r
	 *            the reader to read lines from
	 * @return this parser, positioned on the first statement
	 */
	public Iterator<Node[]> parse(Reader r) {
		return parse(new BufferedReader(r));
	}

	/**
	 * Parses statements from a stream decoded with the given charset.
	 * 
	 * @param is
	 *            the stream to read from
	 * @param cs
	 *            the charset used to decode the stream
	 * @return this parser, positioned on the first statement
	 */
	public Iterator<Node[]> parse(InputStream is, Charset cs) {
		return parse(new BufferedReader(new InputStreamReader(is, cs)));
	}

	/**
	 * Parses statements from a stream decoded with the platform default
	 * charset. Use {@link #parse(InputStream, Charset)} to be explicit.
	 * 
	 * @param is
	 *            the stream to read from
	 * @return this parser, positioned on the first statement
	 */
	public Iterator<Node[]> parse(InputStream is) {
		return parse(new BufferedReader(new InputStreamReader(is)));
	}

	/**
	 * Parses statements from a buffered reader, one statement per line.
	 * 
	 * @param br
	 *            the buffered reader to read lines from
	 * @return this parser, positioned on the first statement
	 */
	public Iterator<Node[]> parse(BufferedReader br) {
		return parse(stringItFromBufferedReader(br));
	}

	/**
	 * Parses statements from an iterable of lines.
	 * 
	 * @param iterable
	 *            the lines to parse
	 * @return this parser, positioned on the first statement
	 */
	public Iterator<Node[]> parse(Iterable<String> iterable) {
		return parse(iterable.iterator());
	}

	/**
	 * Parses statements from an iterator of lines. All other parse methods
	 * end up here. The first statement is read eagerly.
	 * 
	 * @param iterator
	 *            the lines to parse
	 * @return this parser, positioned on the first statement
	 */
	public Iterator<Node[]> parse(Iterator<String> iterator) {
		_stringIt = iterator;
		loadNext();
		return this;
	}

	@Override
	public boolean hasNext() {
		return next != null;
	}

	/**
	 * Returns the look-ahead statement and reads the following one.
	 * 
	 * @return the nodes of the next statement
	 * @throws NoSuchElementException
	 *             if there is no further statement
	 */
	@Override
	public Node[] next() {
		if (next == null)
			throw new NoSuchElementException();
		final Node[] current = next;
		loadNext();
		return current;
	}

	/**
	 * Advances the look-ahead to the next line that yields at least one node,
	 * or sets it to null when the line source is exhausted.
	 * <p>
	 * Implemented as a loop (not recursion) so that arbitrarily long runs of
	 * comment, blank or invalid lines cannot overflow the stack.
	 */
	private void loadNext() {
		next = null;
		while (_stringIt.hasNext()) {
			_line = _stringIt.next();
			++_lineNo;

			if (_line == null || isEntirelyWhitespaceOrEmpty(_line))
				continue; // blank line: nothing to parse

			try {
				final Node[] parsed = parseNodesInternal(_line);
				if (parsed.length > 0) {
					next = parsed;
					return;
				}
				// valid but empty (e.g. a comment): skip and try again
			} catch (Exception e) {
				// invalid: report, skip and try again
				_log.log(java.util.logging.Level.WARNING,
						"Moving on to the next line, as I couldn't parse line "
								+ _lineNo + ": " + _line, e);
			}
		}
	}

	/**
	 * @param s
	 *            the string to test, not null
	 * @return true if s is empty or consists of whitespace only
	 */
	private static boolean isEntirelyWhitespaceOrEmpty(String s) {
		for (int i = 0; i < s.length(); i++) {
			if (!Character.isWhitespace(s.charAt(i)))
				return false;
		}
		return true;
	}

	/**
	 * Calls remove on the underlying string iterator.
	 * <p>
	 * Note that this parser reads ahead: the line last pulled from the
	 * underlying iterator is the most recently read one, which is not
	 * necessarily the line of the statement last returned by {@link #next()}.
	 * 
	 * @throws IllegalStateException
	 *             if no parse method has been called yet
	 */
	@Override
	public void remove() {
		if (_stringIt == null)
			throw new IllegalStateException("No input has been set; call parse(...) first");
		_stringIt.remove();
	}

	@Override
	public Iterator<Node[]> iterator() {
		return this;
	}

	/**
	 * Parses a single line into its nodes.
	 * 
	 * @param line
	 *            the line to parse
	 * @return the nodes found in the line; of zero length for blank or comment
	 *         lines
	 * @throws ParseException
	 *             if the line cannot be parsed; runtime failures (e.g. a null
	 *             line) are wrapped, parse errors are passed on unchanged
	 */
	public static Node[] parseNodes(final String line) throws ParseException {
		try {
			return parseNodesInternal(line);
		} catch (ParseException e) {
			// already the right type: keep its message instead of wrapping it
			throw e;
		} catch (RuntimeException e) {
			throw new ParseException(e);
		}
	}

	/**
	 * Parses line and returns a Node[] contained within that line. May return
	 * an empty Node[] if the line is valid N-Triples but contains no nodes
	 * (e.g., a blank or comment line).
	 * <p>
	 * Terms are recognised by their first character: {@code <} resource,
	 * {@code _} blank node, {@code "} literal, {@code ?} variable, the first
	 * character of {@link Unbound#TO_STRING} for the unbound marker, {@code #}
	 * (only before any term) for a comment, and {@code .} for the end of the
	 * statement. Anything after the terminating full stop is ignored.
	 * 
	 * @param line
	 *            the line to parse, not null
	 * @return A {@link Node} array with the RDF terms found in the line. Can be
	 *         of zero length.
	 * @throws ParseException
	 *             if a term is malformed or unterminated, or the terminating
	 *             full stop is missing
	 */
	protected static Node[] parseNodesInternal(final String line) throws ParseException {
		final int length = line.length();
		final List<Node> nx = new LinkedList<Node>();
		int startIndex = 0;
		int endIndex = 0;

		if (length == 0)
			return new Node[0];

		while (true) {
			// skip whitespace in front of the next term
			while (startIndex < length
					&& Character.isWhitespace(line.charAt(startIndex)))
				++startIndex;

			if (startIndex >= length) {
				if (nx.isEmpty())
					return new Node[0];
				throw new ParseException("Could not find closing '.' bracket for line " + line);
			}

			final char first = line.charAt(startIndex);

			if (first == '<') {
				// resource: runs up to and including the next '>'.
				endIndex = line.indexOf('>', startIndex);
				if (endIndex < 0)
					throw new ParseException("Could not find closing '>' bracket for resource starting at char "
							+ startIndex + " while parsing line " + line);
				++endIndex;
				nx.add(new Resource(line.substring(startIndex, endIndex), true));
			} else if (first == '_') {
				// bnode.
				endIndex = findTermEnd(line, startIndex);
				nx.add(new BNode(line.substring(startIndex, endIndex), true));
			} else if (first == '.') {
				// statement's end.
				if (nx.isEmpty())
					throw new ParseException("Exception at position " + startIndex
							+ " while parsing: '" + line + "'");
				break;
			} else if (first == '"') {
				// literal: locate the delimiting quote first, so that dots
				// and spaces inside the quotes are not taken as terminators;
				// then run on over a language tag or datatype, if any.
				endIndex = findTermEnd(line, findClosingQuote(line, startIndex));
				nx.add(new Literal(line.substring(startIndex, endIndex), true));
			} else if (first == '#' && nx.isEmpty()) {
				// comment line.
				return new Node[0];
			} else if (first == '?') {
				// variable.
				endIndex = findTermEnd(line, startIndex);
				nx.add(new Variable(line.substring(startIndex, endIndex), true));
			} else if (first == Unbound.TO_STRING.charAt(0)) {
				// unbound: must match completely and be followed by a space.
				if (!line.startsWith(Unbound.TO_STRING, startIndex))
					throw new ParseException("Exception at position " + startIndex
							+ " while parsing: '" + line + "'");
				endIndex = startIndex + Unbound.TO_STRING.length();
				if (endIndex >= length || line.charAt(endIndex) != ' ')
					throw new ParseException("Exception at position "
							+ startIndex + " while parsing: '" + line + "'");
				nx.add(new Unbound());
			} else {
				throw new ParseException("Exception at position " + startIndex
						+ " while parsing: '" + line + "'");
			}

			// a term that runs up to the end of the line leaves no room for
			// the terminating full stop
			if (endIndex >= length)
				throw new ParseException("Could not find closing '.' bracket for line " + line);

			if (line.charAt(endIndex) == '.')
				break;

			// the character at endIndex is the delimiter; continue behind it
			startIndex = endIndex + 1;
		}
		return nx.toArray(new Node[nx.size()]);
	}

	/**
	 * Finds the end of a term that is delimited by whitespace or by the
	 * statement's full stop.
	 * <p>
	 * A full stop only ends the term if it is followed by whitespace or is
	 * one of the last two characters of the line; other full stops (as in
	 * datatype URIs) belong to the term.
	 * 
	 * @param line
	 *            the line being parsed
	 * @param from
	 *            the index to start scanning at
	 * @return the index of the delimiting whitespace or full stop
	 * @throws ParseException
	 *             if the line ends before a delimiter is found
	 */
	private static int findTermEnd(String line, int from) throws ParseException {
		final int length = line.length();
		for (int i = from; i < length; i++) {
			final char c = line.charAt(i);
			if (Character.isWhitespace(c))
				return i;
			// inspect the character after the dot, not the index itself
			if (c == '.'
					&& (i + 2 >= length || Character.isWhitespace(line.charAt(i + 1))))
				return i;
		}
		throw new ParseException("Could not find closing '.' bracket for line " + line);
	}

	/**
	 * Finds the quote that closes a literal, telling escaped quotes within
	 * the literal from the delimiting one: a quote preceded by an even number
	 * of backslashes (including none) is the delimiting one.
	 * 
	 * @param line
	 *            the line being parsed
	 * @param openingQuote
	 *            the index of the literal's opening quote
	 * @return the index of the closing quote
	 * @throws ParseException
	 *             if the literal is not closed
	 */
	private static int findClosingQuote(String line, int openingQuote) throws ParseException {
		int candidate = openingQuote;
		while (true) {
			candidate = line.indexOf('"', candidate + 1);
			if (candidate < 0)
				throw new ParseException("Could not find closing '\"' for literal starting at char "
						+ openingQuote + " while parsing line " + line);
			final int backslashes = candidate - onlyCharUntil(line, '\\', candidate - 1);
			if (backslashes % 2 == 0)
				return candidate;
		}
	}

	/**
	 * Looks from a given position i in a string line backwards and returns the
	 * index of the first character of the uninterrupted run of parameter c
	 * that ends at i. Returns i + 1 if the character at i is not c.
	 * 
	 * @param line
	 *            the string
	 * @param c
	 *            the character
	 * @param i
	 *            the starting index
	 * @return the index at which the run of c ending at i starts
	 */
	private static int onlyCharUntil(String line, char c, int i) {
		while (i >= 0 && line.charAt(i) == c) {
			--i;
		}
		return i + 1;
	}

	/**
	 * Creates an iterator of strings from a given buffered reader, one
	 * element per line. An {@link IOException} while reading is logged and
	 * ends the iteration.
	 * 
	 * @param br
	 *            the buffered reader
	 * @return the iterator
	 */
	private static Iterator<String> stringItFromBufferedReader(
			final BufferedReader br) {
		return new Iterator<String>() {

			/** The line read ahead but not yet handed out, or null. */
			private String lookahead = null;
			/** Set once the reader hit its end or failed. */
			private boolean exhausted = false;

			@Override
			public boolean hasNext() {
				if (lookahead == null && !exhausted) {
					try {
						lookahead = br.readLine();
					} catch (IOException e) {
						_log.log(java.util.logging.Level.WARNING,
								"I/O error while reading input, stopping iteration", e);
						lookahead = null;
					}
					if (lookahead == null)
						exhausted = true;
				}
				return lookahead != null;
			}

			@Override
			public String next() {
				// going through hasNext() guarantees that a failed read can
				// never hand out a previously returned line again
				if (!hasNext())
					throw new NoSuchElementException();
				final String current = lookahead;
				lookahead = null;
				return current;
			}

			@Override
			public void remove() {
				throw new UnsupportedOperationException();
			}

		};
	}

	/*
	 * 
	 * FROM HERE DOWNWARDS: LEGACY STUFF
	 */

	/**
	 * Returns the number of lines read from the input so far. As the parser
	 * reads one statement ahead, this is the line of the look-ahead statement
	 * (or of the last line read, once the input is exhausted).
	 * 
	 * @return the line number
	 */
	public int lineNumber() {
		return _lineNo;
	}
}