// org.apache.jena.riot.tokens.Token Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of jena-arq Show documentation
// ARQ is a SPARQL 1.1 query engine for Apache Jena
// There is a newer version: 5.1.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.riot.tokens;

import static org.apache.jena.atlas.lib.Chars.* ;
import static org.apache.jena.atlas.lib.Lib.hashCodeObject ;
import static org.apache.jena.riot.tokens.TokenType.* ;

import java.util.ArrayList ;
import java.util.List ;
import java.util.Objects ;

import org.apache.jena.atlas.io.PeekReader ;
import org.apache.jena.atlas.iterator.Iter ;
import org.apache.jena.atlas.lib.Pair ;
import org.apache.jena.datatypes.RDFDatatype ;
import org.apache.jena.datatypes.TypeMapper ;
import org.apache.jena.datatypes.xsd.XSDDatatype ;
import org.apache.jena.graph.Node ;
import org.apache.jena.graph.NodeFactory ;
import org.apache.jena.riot.RiotException ;
import org.apache.jena.riot.system.PrefixMap ;
import org.apache.jena.riot.system.Prologue ;
import org.apache.jena.sparql.core.Var ;
import org.apache.jena.sparql.graph.NodeConst ;
import org.apache.jena.sparql.util.FmtUtils ;
import org.apache.jena.sparql.util.NodeUtils ;

public final class Token
{
    // Some tokens are "multipart"
    //   A language tag is a sub-token string and token part.
    //     It uses subToken1, and image2.
    //   A datatype literal is two tokens
    //     It uses subToken1, subToken2 and sets image to the lexical part.
    //   A prefixed name is two strings. 
    //     It uses tokenImage and tokenImage2
    
    private TokenType tokenType = null ;
    
    private String tokenImage = null ;
    private String tokenImage2 = null ;         // Used for language tag and second part of prefix name
    
    private Token subToken1 = null ;            // A related token (used for datatype literals and language tags)
    private Token subToken2 = null ;            // A related token (used for datatype literals and language tags)
    private StringType stringType = null ;
    
    public int cntrlCode = 0 ;
    private long column ;
    private long line ;
    
    // Keywords recognized.
    public static final String ImageANY     = "ANY" ;
    public static final String ImageTrue    = "true" ;
    public static final String ImageFalse   = "false" ;
    
    public final TokenType getType()        { return tokenType ; }
    public final String getImage()          { return tokenImage ; }
    //public final String getImage1()  { return tokenImage1 ; }
    
    public final String getImage2()         { return tokenImage2 ; }
    public final int getCntrlCode()         { return cntrlCode ; }
    public final Token getSubToken1()       { return subToken1 ; }
    public final Token getSubToken2()       { return subToken2 ; }
    public final StringType getStringType() { return stringType ; }
    public final boolean hasStringType(StringType st)   { return this.stringType == st ;}
    public final boolean isLongString() {
        switch(stringType) {
            case LONG_STRING1:
            case LONG_STRING2:
                return true;
                default:
                    return false;
        }        
    }
    
    public final Token setType(TokenType tokenType)     { this.tokenType = tokenType ; return this ; }
    public final Token setImage(String tokenImage)      { this.tokenImage = tokenImage ; return this ; }
    public final Token setImage(char tokenImage)        { this.tokenImage = String.valueOf(tokenImage) ; return this ; }
    
    public final Token setImage2(String tokenImage2)    { this.tokenImage2 = tokenImage2 ; return this ; }
    
    public final Token setCntrlCode(int cntrlCode)      { this.cntrlCode = cntrlCode ; return this ; }

    public final Token setSubToken1(Token subToken)     { this.subToken1 = subToken ; return this ; }
    public final Token setSubToken2(Token subToken)     { this.subToken2 = subToken ; return this ; }
    
    public final Token setStringType(StringType st)     { this.stringType = st ; return this ; }
    
    static Token create(String s)
    {
        PeekReader pr = PeekReader.readString(s) ;
        TokenizerText tt = new TokenizerText(pr) ;
        if ( ! tt.hasNext() )
            throw new RiotException("No token") ;
        Token t = tt.next() ;
        if ( tt.hasNext() )
            throw new RiotException("Extraneous charcaters") ;
        return t ;
    }

    static Iter createN(String s)
    {
        PeekReader pr = PeekReader.readString(s) ;
        TokenizerText tt = new TokenizerText(pr) ;
        List x = new ArrayList<>() ;
        while(tt.hasNext())
            x.add(tt.next()) ;
        return Iter.iter(x) ;
    }
    
    public long getColumn()
    {
        return column ;
    }

    public long getLine()
    {
        return line ;
    }

    public Token(String string) { this(STRING, string) ; } 

    public Token(TokenType type) { this(type, null, null) ; }

    public Token(TokenType type, String image1) { this(type, image1, null) ; }

    public Token(TokenType type, String image1, String image2) { 
        this() ;
        setType(type) ;
        setImage(image1) ;
        setImage2(image2) ;
    }
    
//    private Token(TokenType type) { this(type, null, null, null) ; }
//    
//    private Token(TokenType type, String image1) { this(type, image1, null, null) ; }
//    
//    private Token(TokenType type, String image1, String image2)
//    { this(type, image1, image2, null) ; }
//
//    private Token(TokenType type, String image1, Token subToken)
//    { this(type, image1, null, subToken) ; }
//
//
    private Token(TokenType type, String image1, String image2, Token subToken1, Token subToken2)
    {
        this() ;
        setType(type) ;
        setImage(image1) ;
        setImage2(image2) ;
        setSubToken1(subToken1) ;
        setSubToken2(subToken2) ;
    }
    
    private Token() { this(-1, -1) ; }
    
    public Token(long line, long column) { this.line = line ; this.column = column ; }
    
    public Token(Token token)
    { 
        this(token.tokenType, 
             token.tokenImage, token.tokenImage2,
             token.subToken1, token.subToken2) ;
        this.cntrlCode      = token.cntrlCode ;
        this.line           = token.line ; 
        this.column         = token.column ;
    }
    
    // Convenience operations for accessing tokens. 
    
    public String asString() {
        switch (tokenType)
        {
            case STRING: 
                return getImage() ;
            default:
                return null ;
        }
    }
    
    public int asInt() {
        if ( ! hasType(TokenType.INTEGER) ) return -1 ;
        return Integer.valueOf(tokenImage);
    }
    
    public long asLong()
    {
        return asLong(-1) ;
    }
    
    public long asLong(long dft)
    {
        switch (tokenType)
        {
            case INTEGER:   return Long.valueOf(tokenImage) ;
            case HEX:       return Long.valueOf(tokenImage, 16) ;
            default:
                 return dft ;
        }
    }
    
    public String asWord()
    {
        if ( ! hasType(TokenType.KEYWORD) ) return null ;
        return tokenImage ; 
    }
    
    public String text()
    {
        return toString(false) ;
        
    }
    
    @Override
    public String toString()
    {
        return toString(false) ;
    }
     
    static final String delim1 = "" ;
    static final String delim2 = "" ;
    public String toString(boolean addLocation)
    {
        StringBuilder sb = new StringBuilder() ;
        if ( addLocation && getLine() >= 0 && getColumn() >= 0 )
            sb.append(String.format("[%d,%d]", getLine(), getColumn())) ;
        sb.append("[") ;
        if ( getType() == null )
            sb.append("null") ;
        else
            sb.append(getType().toString()) ;
        
        if ( getImage() != null )
        {
            sb.append(":") ;
            sb.append(delim1) ;
            sb.append(getImage()) ;
            sb.append(delim1) ;
        }
            
        if ( getImage2() != null )
        {
            sb.append(":") ;
            sb.append(delim2) ;
            sb.append(getImage2()) ;
            sb.append(delim2) ;
        }
        
        if ( getSubToken1() != null )
        {
            sb.append(";") ;
            sb.append(delim2) ;
            sb.append(getSubToken1().toString()) ;
            sb.append(delim2) ;
        }   

        if ( getSubToken2() != null )
        {
            sb.append(";") ;
            sb.append(delim2) ;
            sb.append(getSubToken2().toString()) ;
            sb.append(delim2) ;
        }   

        if ( getCntrlCode() != 0 )
        {
            sb.append(":") ; 
            sb.append(getCntrlCode()) ;
        }
        sb.append("]") ;
        return sb.toString() ;
    }
    
    public boolean isEOF()      { return tokenType == TokenType.EOF ; }
    
    public boolean isWord()     { return tokenType == TokenType.KEYWORD ; }

    public boolean isString()   { return tokenType == TokenType.STRING ; }

    public boolean isNumber()
    {
        switch(tokenType)
        {
            case DECIMAL: 
            case DOUBLE:
            case INTEGER:
                return true ;
            default:
                return false ;
        }
    }
    
    public boolean isNode()
    {
        switch(tokenType)
        {
            case BNODE :
            case IRI : 
            case PREFIXED_NAME :
            case DECIMAL: 
            case DOUBLE:
            case INTEGER:
            case LITERAL_DT:
            case LITERAL_LANG:
            case STRING:
                return true ;
            case KEYWORD:
                if ( tokenImage.equals(ImageANY) )
                    return true ;
                return false ;
            default:
                return false ;
        }
    }
    
    // N-Triples but allows single quoted strings as well.
    public boolean isNodeBasic()
    {
        switch(tokenType)
        {
            case BNODE :
            case IRI : 
            case PREFIXED_NAME :
            case LITERAL_DT:
            case LITERAL_LANG:
                return true;
            case STRING : {
                switch (stringType) {
                    case STRING1 :
                    case STRING2 :
                        return true;
                    default :
                        return false;
                }
            }
            default:
                return false ;
        }
    }
    
    public boolean isBasicLiteral()
    {
        switch(tokenType)
        {
            case LITERAL_DT:
            case LITERAL_LANG:
            case STRING:
                return true ;
            default:
                return false ;
        }
    }
    
    public boolean isInteger()
    {
        return tokenType.equals(TokenType.INTEGER) ;
    }
    
    public boolean isIRI()
    {
        return tokenType.equals(TokenType.IRI) || tokenType.equals(TokenType.PREFIXED_NAME);
    }

    public boolean isBNode()
    {
        return tokenType.equals(TokenType.BNODE) ;
    }

    
    /** Token to Node, a very direct form that is purely driven off the token.
     *  Turtle and N-triples need to process the token and not call this:
     *  1/ Use bNode label as given
     *  2/ No prefix or URI resolution.
     *  3/ No checking.
     */
    public Node asNode()
    {
        return asNode(null) ;
    }
    
    /** Token to Node, with a prefix map
     *  Turtle and N-triples need to process the token and not call this:
     *  1/ Use bNode label as given
     *  2/ No prefix or URI resolution.
     *  3/ No checking.
     */
    public Node asNode(PrefixMap pmap)
    {
        switch(tokenType)
        {
            // Assumes that bnode labels have been sorted out already.
            case BNODE : return NodeFactory.createBlankNode(tokenImage) ;
            case IRI :
                // RiotLib.createIRIorBNode(tokenImage) includes processing <_:label>
                return NodeFactory.createURI(tokenImage) ; 
            case PREFIXED_NAME :
                if ( pmap == null )
                    return NodeFactory.createURI("urn:prefixed-name:"+tokenImage+":"+tokenImage2) ;
                String x = pmap.expand(tokenImage, tokenImage2) ;
                if ( x == null )
                    throw new RiotException("Can't expand prefixed name: "+this) ;
                return NodeFactory.createURI(x) ;
            case DECIMAL :  return NodeFactory.createLiteral(tokenImage, XSDDatatype.XSDdecimal)  ; 
            case DOUBLE :   return NodeFactory.createLiteral(tokenImage, XSDDatatype.XSDdouble)  ;
            case INTEGER:   return NodeFactory.createLiteral(tokenImage, XSDDatatype.XSDinteger) ;
            case LITERAL_DT :
            {
                Token lexToken = getSubToken1() ;
                Token dtToken  = getSubToken2() ;
                
                if ( pmap == null && dtToken.hasType(TokenType.PREFIXED_NAME) )
                    // Must be able to resolve the datattype else we can't find it's datatype.
                    throw new RiotException("Invalid token: "+this) ;
                Node n = dtToken.asNode(pmap);
                if ( ! n.isURI() )
                    throw new RiotException("Invalid token: "+this) ;
                RDFDatatype dt = TypeMapper.getInstance().getSafeTypeByName(n.getURI()) ;
                return NodeFactory.createLiteral(lexToken.getImage(), dt)  ;
            }
            case LITERAL_LANG : return NodeFactory.createLiteral(tokenImage, tokenImage2)  ;
            case STRING:
                return NodeFactory.createLiteral(tokenImage) ;
            case VAR:
                return Var.alloc(tokenImage) ;
            case KEYWORD:
                if ( tokenImage.equals(ImageANY) )
                    return NodeConst.nodeANY ;
                if ( tokenImage.equals(ImageTrue) )
                    return NodeConst.nodeTrue ;
                if ( tokenImage.equals(ImageFalse) )
                    return NodeConst.nodeFalse ;
                //$FALL-THROUGH$
            default: break ;
        }
        return null ;
    }

    
    public boolean hasType(TokenType tokenType)
    {
        return getType() == tokenType ;
    }
    
    @Override
    public int hashCode()
    {
        return hashCodeObject(tokenType) ^
                hashCodeObject(tokenImage) ^
                hashCodeObject(tokenImage2) ^
                hashCodeObject(cntrlCode) ;
    }
    
    @Override
    public boolean equals(Object other)
    {
        if ( ! ( other instanceof Token ) ) return false ;
        Token t = (Token)other ;
        return  Objects.equals(tokenType, t.tokenType) &&
        		Objects.equals(tokenImage, t.tokenImage) &&
        		Objects.equals(tokenImage2, t.tokenImage2) &&
        		Objects.equals(cntrlCode, t.cntrlCode) ;
    }
    
    public static Token tokenForChar(char character)
    {
        switch(character)
        { 
            case CH_DOT:        return new Token(TokenType.DOT) ;
            case CH_SEMICOLON:  return new Token(TokenType.SEMICOLON) ;
            case CH_COMMA:      return new Token(TokenType.COMMA) ;
            case CH_LBRACE:     return new Token(TokenType.LBRACE) ;
            case CH_RBRACE:     return new Token(TokenType.RBRACE) ;
            case CH_LPAREN:     return new Token(TokenType.LPAREN) ;
            case CH_RPAREN:     return new Token(TokenType.RPAREN) ;
            case CH_LBRACKET:   return new Token(TokenType.LBRACKET) ;
            case CH_RBRACKET:   return new Token(TokenType.RBRACKET) ;
            default:
                throw new RuntimeException("Token error: unrecognized character: "+character) ;
        }
    }
    
    public static Token tokenForInteger(long value)
    {
        return new Token(TokenType.INTEGER, Long.toString(value)) ;
    }
    
    public static Token tokenForWord(String word)
    {
        return new Token(TokenType.KEYWORD, word) ; 
    }

    public static Token tokenForNode(Node n)
    {
        return tokenForNode(n, null, null) ;
    }

    public static Token tokenForNode(Node n, Prologue prologue)
    {
        return tokenForNode(n, prologue.getBaseURI(), prologue.getPrefixMap()) ;
    }

    private static final String dtXSDintger = XSDDatatype.XSDinteger.getURI();
    private static final String dtXSDdecimal = XSDDatatype.XSDdecimal.getURI();
    private static final String dtXSDdouble = XSDDatatype.XSDdouble.getURI();
    
    public static Token tokenForNode(Node node, String base, PrefixMap mapping) {
        if ( node.isURI() ) {
            String uri = node.getURI() ;
            if ( mapping != null ) {
                Pair pname = mapping.abbrev(uri) ;
                if ( pname != null )
                    return new Token(TokenType.PREFIXED_NAME, pname.getLeft(), pname.getRight()) ;
            }
            if ( base != null ) {
                String x = FmtUtils.abbrevByBase(uri, base) ;
                if ( x != null )
                    return new Token(TokenType.IRI, x) ;
            }
            return new Token(IRI, node.getURI()) ;
        }

        if ( node.isBlank() )
            return new Token(BNODE, node.getBlankNodeLabel()) ;

        if ( node.isVariable() )
            return new Token(VAR, node.getName()) ;

        if ( node.isLiteral() ) {
            if ( NodeUtils.isSimpleString(node) ) {
                String lex = node.getLiteralLexicalForm() ;
                return new Token(STRING, lex) ;
            }

            if ( NodeUtils.isLangString(node) ) {
                String lex = node.getLiteralLexicalForm() ;
                Token sub1 = new Token(STRING, lex) ;
                String lang = node.getLiteralLanguage() ;
                return new Token(LITERAL_LANG, lex, lang, sub1, null) ;
            }

            // Has a datatype (RDF 1.0 and RDF 1.1)
            String datatype = node.getLiteralDatatypeURI() ;
            String s = node.getLiteralLexicalForm() ;

            // Special form we know how to handle?
            // Assume valid text
            if ( datatype.equals(dtXSDintger) ) {
                try {
                    String s1 = s ;
                    // BigInteger does not allow leading +
                    // so chop it off before the format test
                    // BigDecimal does allow a leading +
                    if ( s.startsWith("+") )
                        s1 = s.substring(1) ;
                    new java.math.BigInteger(s1) ;
                    return new Token(INTEGER, s) ;
                }
                catch (NumberFormatException nfe) {}
                // No luck. Continue.
                // Continuing is always safe.
            }

            if ( datatype.equals(dtXSDdecimal) ) {
                if ( s.indexOf('.') > 0 ) {
                    try {
                        // BigDecimal does allow a leading +
                        new java.math.BigDecimal(s) ;
                        return new Token(DECIMAL, s) ;
                    }
                    catch (NumberFormatException nfe) {}
                    // No luck. Continue.
                }
            }

            if ( datatype.equals(dtXSDdouble) ) {
                // Assumes SPARQL has decimals and doubles.
                // Must have 'e' or 'E' to be a double short form.

                if ( s.indexOf('e') >= 0 || s.indexOf('E') >= 0 ) {
                    try {
                        Double.parseDouble(s) ;
                        return new Token(DOUBLE, s) ;
                    }
                    catch (NumberFormatException nfe) {}
                    // No luck. Continue.
                }
            }

            // if ( datatype.equals(XSD.xboolean.getURI()) ) {
            //     if ( s.equalsIgnoreCase("true") ) return new Token(BOOLEAN, s) ;
            //     if ( s.equalsIgnoreCase("false") ) return new Token(BOOLEAN, s) ;
            // }

            Node dt = NodeFactory.createURI(datatype) ;
            Token subToken1 = new Token(STRING, s) ;
            Token subToken2 = tokenForNode(dt) ;
            Token t = new Token(LITERAL_DT, s) ;
            t.setSubToken1(subToken1) ;
            t.setSubToken2(subToken2) ;
            return t ;
        }

        if ( node.equals(Node.ANY) )
            return new Token(TokenType.KEYWORD, ImageANY) ;

        throw new IllegalArgumentException() ;
    }
}