All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.pdfbox.pdmodel.common.function.type4.Parser Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdmodel.common.function.type4;

/**
 * Parser for PDF Type 4 functions. This implements a small subset of the PostScript
 * language; it is not a full PostScript interpreter.
 *
 * <p>The input is split into new lines, whitespace runs, comments and tokens, and each
 * element is reported to a {@link SyntaxHandler} in the order in which it appears.</p>
 */
public final class Parser
{

    /** Used to indicate the parser's current state. */
    private enum State
    {
        NEWLINE, WHITESPACE, COMMENT, TOKEN
    }

    /** Utility class; all functionality is reached through {@link #parse}. */
    private Parser()
    {
        //nop
    }

    /**
     * Parses a Type 4 function and sends the syntactic elements to the given
     * syntax handler.
     * @param input the text source
     * @param handler the syntax handler
     */
    public static void parse(CharSequence input, SyntaxHandler handler)
    {
        Tokenizer tokenizer = new Tokenizer(input, handler);
        tokenizer.tokenize();
    }

    /**
     * This interface defines all possible syntactic elements of a Type 4 function.
     * It is called by the parser as the function is interpreted.
     *
     * <p>The {@link CharSequence} passed to each callback is an internal buffer that the
     * parser clears and reuses for the next element. Implementations that need the text
     * after the callback returns must copy it (for example with {@code toString()}).</p>
     */
    public interface SyntaxHandler
    {

        /**
         * Indicates that a new line starts.
         * @param text the new line character (CR, LF, CR/LF or FF)
         */
        void newLine(CharSequence text);

        /**
         * Called when whitespace characters are encountered.
         * @param text the whitespace text (a run of NUL, TAB and SPACE characters)
         */
        void whitespace(CharSequence text);

        /**
         * Called when a token is encountered. No distinction between operators and values
         * is done here.
         * @param text the token text
         */
        void token(CharSequence text);

        /**
         * Called for a comment.
         * @param text the comment, starting with the leading '%' and excluding the
         *             terminating new line character
         */
        void comment(CharSequence text);
    }

    /**
     * Abstract base class for a {@link SyntaxHandler}. Comments, new lines and whitespace
     * are ignored; subclasses only have to implement {@link #token(CharSequence)}.
     */
    public abstract static class AbstractSyntaxHandler implements SyntaxHandler
    {

        /** {@inheritDoc} */
        @Override
        public void comment(CharSequence text)
        {
            //nop
        }

        /** {@inheritDoc} */
        @Override
        public void newLine(CharSequence text)
        {
            //nop
        }

        /** {@inheritDoc} */
        @Override
        public void whitespace(CharSequence text)
        {
            //nop
        }

    }

    /**
     * Tokenizer for Type 4 functions. Walks over the input once and reports every
     * syntactic element to the handler.
     */
    private static final class Tokenizer
    {

        private static final char NUL = '\u0000'; //NUL
        private static final char EOT = '\u0004'; //END OF TRANSMISSION
        private static final char TAB = '\u0009'; //TAB CHARACTER
        private static final char FF = '\u000C'; //FORM FEED
        private static final char CR = '\r'; //CARRIAGE RETURN
        private static final char LF = '\n'; //LINE FEED
        private static final char SPACE = '\u0020'; //SPACE

        private final CharSequence input;
        /** Position of the current character in {@link #input}. */
        private int index;
        private final SyntaxHandler handler;
        private State state = State.WHITESPACE;
        /** Collects the text of the element being scanned; reused for every element. */
        private final StringBuilder buffer = new StringBuilder();

        private Tokenizer(CharSequence text, SyntaxHandler syntaxHandler)
        {
            this.input = text;
            this.handler = syntaxHandler;
        }

        /** Returns true while the current index is inside the input. */
        private boolean hasMore()
        {
            return index < input.length();
        }

        /** Returns the character at the current index; only valid while {@link #hasMore()}. */
        private char currentChar()
        {
            return input.charAt(index);
        }

        /**
         * Advances to the next character and returns it, or returns the {@link #EOT}
         * sentinel when the end of the input has been passed.
         */
        private char nextChar()
        {
            index++;
            if (!hasMore())
            {
                return EOT;
            }
            else
            {
                return currentChar();
            }
        }

        /**
         * Returns the character after the current one without advancing, or the
         * {@link #EOT} sentinel if there is none.
         */
        private char peek()
        {
            if (index < input.length() - 1)
            {
                return input.charAt(index + 1);
            }
            else
            {
                return EOT;
            }
        }

        /** Derives the state from the current character, stores it and returns it. */
        private State nextState()
        {
            char ch = currentChar();
            switch (ch)
            {
            case CR:
            case LF:
            case FF: //FF
                state = State.NEWLINE;
                break;
            case NUL:
            case TAB:
            case SPACE:
                state = State.WHITESPACE;
                break;
            case '%':
                state = State.COMMENT;
                break;
            default:
                state = State.TOKEN;
            }
            return state;
        }

        /** Main loop: classifies the current character and scans one element per pass. */
        private void tokenize()
        {
            while (hasMore())
            {
                buffer.setLength(0);
                nextState();
                switch (state)
                {
                case NEWLINE:
                    scanNewLine();
                    break;
                case WHITESPACE:
                    scanWhitespace();
                    break;
                case COMMENT:
                    scanComment();
                    break;
                default:
                    scanToken();
                }
            }
        }

        /** Reports a single new line (CR, LF, FF or CR/LF) and moves past it. */
        private void scanNewLine()
        {
            assert state == State.NEWLINE;
            char ch = currentChar();
            buffer.append(ch);
            if (ch == CR && peek() == LF)
            {
                //CRLF is treated as one newline
                buffer.append(nextChar());
            }
            handler.newLine(buffer);
            nextChar();
        }

        /** Reports a run of NUL, TAB and SPACE characters as one whitespace element. */
        private void scanWhitespace()
        {
            assert state == State.WHITESPACE;
            buffer.append(currentChar());
            loop:
            while (hasMore())
            {
                char ch = nextChar();
                switch (ch)
                {
                case NUL:
                case TAB:
                case SPACE:
                    buffer.append(ch);
                    break;
                default:
                    //also taken for the EOT sentinel at the end of the input
                    break loop;
                }
            }
            handler.whitespace(buffer);
        }

        /**
         * Reports a comment: everything from the leading '%' up to, but not including,
         * the next CR, LF or FF, or up to the end of the input.
         */
        private void scanComment()
        {
            assert state == State.COMMENT;
            buffer.append(currentChar());
            index++;
            //The index is advanced explicitly instead of through nextChar() so that the
            //EOT sentinel is never appended when the comment ends at the end of the input.
            loop:
            while (hasMore())
            {
                char ch = currentChar();
                switch (ch)
                {
                case CR:
                case LF:
                case FF:
                    //leave the new line character for the next pass of tokenize()
                    break loop;
                default:
                    buffer.append(ch);
                    index++;
                }
            }
            //new line or EOF reached
            handler.comment(buffer);
        }

        /**
         * Reports a token. Braces are always single-character tokens; any other token
         * extends up to the next whitespace, new line, brace or the end of the input.
         */
        private void scanToken()
        {
            assert state == State.TOKEN;
            char ch = currentChar();
            buffer.append(ch);
            switch (ch)
            {
            case '{':
            case '}':
                handler.token(buffer);
                nextChar();
                return;
            default:
                //continue
            }
            loop:
            while (hasMore())
            {
                ch = nextChar();
                switch (ch)
                {
                case NUL:
                case TAB:
                case SPACE:
                case CR:
                case LF:
                case FF:
                case EOT: //sentinel from nextChar() at the end of the input
                case '{':
                case '}':
                    break loop;
                default:
                    buffer.append(ch);
                }
            }
            //delimiter or EOF reached
            handler.token(buffer);
        }

    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy