All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.formkiq.vision.pdf.parser.PdfTokenizer Maven / Gradle / Ivy

/*
 * Copyright (C) 2018 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.pdf.parser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;

/**
 * Splits the content stream of a {@link PDPage} into {@link PdfToken}s.
 *
 * <p>Each {@link PdfToken} pairs one content-stream {@link Operator} with the
 * run of non-operator objects that immediately precedes it in the stream.
 */
public class PdfTokenizer {

    /** {@link List} of {@link PdfToken}, in content-stream order. */
    private final List<PdfToken> tokens;

    /**
     * Parses the given page and builds its tokens eagerly.
     *
     * @param page {@link PDPage} whose content stream is tokenized
     * @throws RuntimeException wrapping the {@link IOException} raised while
     *         parsing the page's content stream
     */
    public PdfTokenizer(final PDPage page) {

        try {
            this.tokens = buildPdfTokens(parsePageTokens(page));
        } catch (IOException e) {
            // TODO change to throw a PdfTokenizerException
            throw new RuntimeException("Unable to tokenize PDF page", e);
        }
    }

    /**
     * Build {@link PdfToken} from {@link Object}.
     *
     * <p>The stream is walked once, front to back. Non-operator objects are
     * collected as pending operands; when an {@link Operator} is reached a
     * {@link PdfToken} is created for it and receives the pending operands in
     * their original order. Objects after the last operator are not attached
     * to any token and are therefore dropped.
     *
     * @param pageObjects {@link List} of {@link Object}
     * @return {@link List} of {@link PdfToken}
     */
    private static List<PdfToken> buildPdfTokens(
            final List<Object> pageObjects) {

        List<PdfToken> list = new ArrayList<>();
        List<PdfTokenOperand> pending = new ArrayList<>();

        for (Object object : pageObjects) {

            if (object instanceof Operator) {

                PdfToken token =
                        new PdfToken(new PdfTokenOperator((Operator) object));
                token.setOperands(pending);
                list.add(token);

                // Each token owns its operand list; start a fresh one.
                pending = new ArrayList<>();

            } else {
                pending.add(new PdfTokenOperand(object));
            }
        }

        return list;
    }

    /**
     * Transforms {@link PDPage} tokens into a {@link List} of {@link Object}.
     *
     * <p>Tokens are read one at a time until the parser returns
     * {@code null}.
     *
     * @param page {@link PDPage}
     * @return {@link List} of {@link Object}
     * @throws IOException IOException
     */
    private static List<Object> parsePageTokens(final PDPage page)
            throws IOException {

        PDFStreamParser parser = new PDFStreamParser(page);
        List<Object> objects = new ArrayList<>();

        Object token = parser.parseNextToken();
        while (token != null) {
            objects.add(token);
            token = parser.parseNextToken();
        }

        return objects;
    }

    /**
     * Returns the tokens built at construction time. The returned list is the
     * tokenizer's own list, not a copy.
     *
     * @return {@link List} of {@link PdfToken}
     */
    public List<PdfToken> getTokens() {
        return this.tokens;
    }
}