org.apache.pdfbox.pdmodel.common.function.type4.Parser Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
There is a newer version: 3.0.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdmodel.common.function.type4;

/**
 * Parser for PDF Type 4 functions. This implements a small subset of the PostScript
 * language but is no full PostScript interpreter.
 * <p>
 * The parser is purely lexical: it splits the input into new lines, whitespace runs,
 * comments and tokens and reports each element to a {@link SyntaxHandler} in the order
 * in which it appears in the input.
 */
public final class Parser
{

    /** Used to indicate the parser's current state. */
    private enum State
    {
        NEWLINE, WHITESPACE, COMMENT, TOKEN
    }

    private Parser()
    {
        //nop
    }

    /**
     * Parses a Type 4 function and sends the syntactic elements to the given
     * syntax handler.
     * <p>
     * An empty input results in no callbacks at all.
     *
     * @param input the text source
     * @param handler the syntax handler
     */
    public static void parse(CharSequence input, SyntaxHandler handler)
    {
        Tokenizer tokenizer = new Tokenizer(input, handler);
        tokenizer.tokenize();
    }

    /**
     * This interface defines all possible syntactic elements of a Type 4 function.
     * It is called by the parser as the function is interpreted.
     * <p>
     * The {@link CharSequence} passed to each callback is backed by a buffer that the
     * parser clears and reuses for the next element. Implementations that need to keep
     * the text beyond the duration of the callback must copy it, for example with
     * {@link CharSequence#toString()}.
     */
    public interface SyntaxHandler
    {

        /**
         * Indicates that a new line starts.
         * @param text the new line character (CR, LF, CR/LF or FF)
         */
        void newLine(CharSequence text);

        /**
         * Called when whitespace characters are encountered.
         * @param text the whitespace text (a run of NUL, TAB and SPACE characters)
         */
        void whitespace(CharSequence text);

        /**
         * Called when a token is encountered. No distinction between operators and values
         * is done here.
         * @param text the token text
         */
        void token(CharSequence text);

        /**
         * Called for a comment.
         * @param text the comment, starting with the '%' character and excluding the
         *             line terminator that ends it
         */
        void comment(CharSequence text);
    }

    /**
     * Abstract base class for a {@link SyntaxHandler}. Comments, new lines and whitespace
     * are ignored by default so that subclasses only have to implement
     * {@link #token(CharSequence)}.
     */
    public abstract static class AbstractSyntaxHandler implements SyntaxHandler
    {

        /** {@inheritDoc} */
        @Override
        public void comment(CharSequence text)
        {
            //nop
        }

        /** {@inheritDoc} */
        @Override
        public void newLine(CharSequence text)
        {
            //nop
        }

        /** {@inheritDoc} */
        @Override
        public void whitespace(CharSequence text)
        {
            //nop
        }

    }

    /**
     * Tokenizer for Type 4 functions.
     * <p>
     * The tokenizer keeps a cursor ({@code index}) into the input. Each scan method is
     * entered with the cursor on the first character of an element and leaves it on the
     * first character after that element (or at the end of the input).
     */
    private static final class Tokenizer
    {

        private static final char NUL = '\u0000'; //NUL
        private static final char EOT = '\u0004'; //END OF TRANSMISSION
        private static final char TAB = '\u0009'; //TAB CHARACTER
        private static final char FF = '\u000C'; //FORM FEED
        private static final char CR = '\r'; //CARRIAGE RETURN
        private static final char LF = '\n'; //LINE FEED
        private static final char SPACE = '\u0020'; //SPACE

        private final CharSequence input;
        private int index;
        private final SyntaxHandler handler;
        private State state = State.WHITESPACE;
        /** Collects the text of the current element; cleared before each element. */
        private final StringBuilder buffer = new StringBuilder();

        private Tokenizer(CharSequence text, SyntaxHandler syntaxHandler)
        {
            this.input = text;
            this.handler = syntaxHandler;
        }

        /** Returns true as long as the cursor is inside the input. */
        private boolean hasMore()
        {
            return index < input.length();
        }

        /** Returns the character under the cursor; the cursor must be inside the input. */
        private char currentChar()
        {
            return input.charAt(index);
        }

        /**
         * Advances the cursor by one position and returns the character found there.
         * If the cursor moved past the end of the input, {@link #EOT} is returned as an
         * end marker.
         */
        private char nextChar()
        {
            index++;
            if (!hasMore())
            {
                return EOT;
            }
            else
            {
                return currentChar();
            }
        }

        /**
         * Returns the character after the cursor without moving the cursor, or
         * {@link #EOT} if there is no such character.
         */
        private char peek()
        {
            if (index < input.length() - 1)
            {
                return input.charAt(index + 1);
            }
            else
            {
                return EOT;
            }
        }

        /**
         * Determines the kind of element that starts at the cursor, stores it in
         * {@code state} and returns it.
         */
        private State nextState()
        {
            char ch = currentChar();
            switch (ch)
            {
            case CR:
            case LF:
            case FF: //FF
                state = State.NEWLINE;
                break;
            case NUL:
            case TAB:
            case SPACE:
                state = State.WHITESPACE;
                break;
            case '%':
                state = State.COMMENT;
                break;
            default:
                state = State.TOKEN;
            }
            return state;
        }

        /**
         * Main loop: classifies the element at the cursor and delegates to the matching
         * scan method until the input is exhausted.
         */
        private void tokenize()
        {
            while (hasMore())
            {
                buffer.setLength(0);
                nextState();
                switch (state)
                {
                case NEWLINE:
                    scanNewLine();
                    break;
                case WHITESPACE:
                    scanWhitespace();
                    break;
                case COMMENT:
                    scanComment();
                    break;
                default:
                    scanToken();
                }
            }
        }

        /** Reports a single line terminator (CR, LF, FF or the pair CR/LF). */
        private void scanNewLine()
        {
            assert state == State.NEWLINE;
            char ch = currentChar();
            buffer.append(ch);
            if (ch == CR && peek() == LF)
            {
                //CRLF is treated as one newline
                buffer.append(nextChar());
            }
            handler.newLine(buffer);
            //move past the line terminator
            nextChar();
        }

        /** Reports a run of consecutive NUL, TAB and SPACE characters. */
        private void scanWhitespace()
        {
            assert state == State.WHITESPACE;
            buffer.append(currentChar());
            loop:
            while (hasMore())
            {
                char ch = nextChar();
                switch (ch)
                {
                case NUL:
                case TAB:
                case SPACE:
                    buffer.append(ch);
                    break;
                default:
                    //anything else (including the end marker) ends the run
                    break loop;
                }
            }
            handler.whitespace(buffer);
        }

        /**
         * Reports a comment: everything from the '%' under the cursor up to, but not
         * including, the next CR, LF or FF, or up to the end of the input.
         */
        private void scanComment()
        {
            assert state == State.COMMENT;
            buffer.append(currentChar());
            loop:
            while (hasMore())
            {
                char ch = nextChar();
                if (!hasMore())
                {
                    //nextChar() stepped past the end and returned the EOT end marker;
                    //that marker is not part of the input and must not end up in the comment
                    break;
                }
                switch (ch)
                {
                case CR:
                case LF:
                case FF:
                    break loop;
                default:
                    buffer.append(ch);
                }
            }
            //line terminator or EOF reached
            handler.comment(buffer);
        }

        /**
         * Reports a token. A brace is always a token of its own. Any other token extends
         * until whitespace, a line terminator, a brace, an EOT character or the end of the
         * input is found. Note that '%' does not end a token; a comment is only recognized
         * where a new element starts.
         */
        private void scanToken()
        {
            assert state == State.TOKEN;
            char ch = currentChar();
            buffer.append(ch);
            if (ch == '{' || ch == '}')
            {
                //braces are self-delimiting single-character tokens
                handler.token(buffer);
                nextChar();
                return;
            }
            loop:
            while (hasMore())
            {
                ch = nextChar();
                switch (ch)
                {
                case NUL:
                case TAB:
                case SPACE:
                case CR:
                case LF:
                case FF:
                case EOT:
                case '{':
                case '}':
                    break loop;
                default:
                    buffer.append(ch);
                }
            }
            //delimiter or EOF reached
            handler.token(buffer);
        }

    }

}