// com.lowagie.text.pdf.parser.MarkedUpTextAssembler Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of openpdf Show documentation
// There is a newer version: 1.2.2.1-jre17
/**
 * Copyright 2014 by Tizra Inc.
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 * 

 * The Original Code is 'iText, a free JAVA-PDF library'.
 * 

 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
 * 

 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 * 

 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 * 

 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 */
package com.lowagie.text.pdf.parser;

import com.lowagie.text.pdf.PdfReader;

import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.List;

/**
 * We'll get called on a variety of marked section content (perhaps including
 * the results of nested sections), and will assemble it into order as best
 * we can.
 *
 * @author dgd
 *
 */
public class MarkedUpTextAssembler implements TextAssembler {
    /**
     * our result may be partially processed already, in which case we'll just
     * add things to it, once ready.
     */
    List result = new ArrayList<>();
    private PdfReader reader;
    @Nullable
    private ParsedTextImpl inProgress = null;
    private int page;
    private int wordIdCounter = 1;
    private boolean usePdfMarkupElements = false;
    /**
     * as we get new content (final or not), we accumulate it until we reach the
     * end of a parsing unit
     *
     * Each parsing unit may have a tag name that should wrap its content
     */
    private List partialWords = new ArrayList<>();

    MarkedUpTextAssembler(PdfReader reader) {
        this.reader = reader;
    }

    MarkedUpTextAssembler(PdfReader reader, boolean usePdfMarkupElements) {
        this.reader = reader;
        this.usePdfMarkupElements = usePdfMarkupElements;
    }

    /**
     * Remember an unassembled chunk until we hit the end of this element, or we
     * hit an assembled chunk, and need to pull things together.
     *
     * @param unassembled
     *            chunk of text rendering instruction to contribute to final
     *            text
     */
    @Override
    public void process(ParsedText unassembled, String contextName) {
        partialWords.addAll(unassembled.getAsPartialWords());
    }

    /**
     * Slot fully-assembled chunk into our result at the current location. If
     * there are unassembled chunks waiting, assemble them first.
     *
     * @param completed
     *            This is a chunk from a nested element
     */
    @Override
    public void process(FinalText completed, String contextName) {
        clearAccumulator();
        result.add(completed);

    }

    /**
     * {@inheritDoc}
     * @see com.lowagie.text.pdf.parser.TextAssembler#process(Word, String)
     */
    @Override
    public void process(Word completed, String contextName) {
        partialWords.add(completed);
    }

    /**
     *
     */
    private void clearAccumulator() {
        for (TextAssemblyBuffer partialWord : partialWords) {
            // Visit each partialWord, calling renderText 
            partialWord.assemble(this);
        }
        partialWords.clear();
        if (inProgress != null) {
            result.add(inProgress.getFinalText(reader, page, this, usePdfMarkupElements));
            inProgress = null;
        }
    }

    private FinalText concatenateResult(@Nullable String containingElementName) {
        // null element means that this is a formatting artifact, not content.
        if (containingElementName == null) {
            // at some point, we may want to extract alternate text for some
            // artifacts.
            return null;
        }
        StringBuilder res = new StringBuilder();
        if (usePdfMarkupElements && !containingElementName.isEmpty()) {
            res.append('<').append(containingElementName).append('>');
        }
        for (FinalText item : result) {
            res.append(item.getText());
        }
        // important, as the stuff buffered in the result is now used up!
        result.clear();
        if (usePdfMarkupElements && !containingElementName.isEmpty()) {
            res.append("= 0) {
                containingElementName = containingElementName.substring(0, spacePos);
            }
            res.append(containingElementName).append('>');
        }
        return new FinalText(res.toString());
    }

    /**
     * {@inheritDoc}
     * @see com.lowagie.text.pdf.parser.TextAssembler#endParsingContext(String)
     */
    @Override
    public FinalText endParsingContext(@Nullable String containingElementName) {
        clearAccumulator();
        return concatenateResult(containingElementName);
    }

    /**
     *
     * @see com.lowagie.text.pdf.parser.TextAssembler#reset()
     */
    @Override
    public void reset() {
        result.clear();
        partialWords.clear();
        inProgress = null;
    }

    @Override
    public void renderText(FinalText finalText) {
        result.add(finalText);
    }

    /**
     * Captures text using a simplified algorithm for inserting hard returns and
     * spaces
     *
     * @see com.lowagie.text.pdf.parser.GraphicsState
     * @see com.lowagie.text.pdf.parser.Matrix
     */
    @Override
    public void renderText(ParsedTextImpl partialWord) {
        boolean firstRender = inProgress == null;
        boolean hardReturn = false;
        if (firstRender) {
            inProgress = partialWord;
            return;
        }
        Vector start = partialWord.getStartPoint();
        Vector lastStart = inProgress.getStartPoint();
        Vector lastEnd = inProgress.getEndPoint();

        // see
        // http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
        float dist = inProgress.getBaseline()
                .subtract(lastStart)
                .cross(lastStart.subtract(start))
                .lengthSquared() / inProgress.getBaseline().subtract(lastStart).lengthSquared();

        float sameLineThreshold = partialWord.getAscent() * 0.5f;
        // let's try using 25% of current leading for vertical slop.
        if (dist > sameLineThreshold || Float.isNaN(dist)) {
            hardReturn = true;
        }
        /*
         * Note: Technically, we should check both the start and end positions,
         * in case the angle of the text changed without any displacement but
         * this sort of thing probably doesn't happen much in reality, so we'll
         * leave it alone for now
         */
        float spacing = lastEnd.subtract(start).length();
        if (hardReturn || partialWord.breakBefore()) {
            result.add(inProgress.getFinalText(reader, page, this, usePdfMarkupElements));
            if (hardReturn) {
                result.add(new FinalText("\n"));
                if (usePdfMarkupElements) {
                    result.add(new FinalText("
"));
                }
            }
            inProgress = partialWord;
            // System.out.println("<< Hard Return >>");
        } else if (spacing < partialWord.getSingleSpaceWidth() / 2.3 || inProgress.shouldNotSplit()) {
            inProgress = new Word(inProgress.getText() + partialWord.getText().trim(),
                    partialWord.getAscent(),
                    partialWord.getDescent(),
                    lastStart,
                    partialWord.getEndPoint(),
                    inProgress.getBaseline(),
                    partialWord.getSingleSpaceWidth(),
                    inProgress.shouldNotSplit(),
                    inProgress.breakBefore());
        } else {
            result.add(inProgress.getFinalText(reader, page, this, usePdfMarkupElements));
            inProgress = partialWord;
        }
    }

    /**
     * Getter.
     *
     * @return reader
     */
    protected PdfReader getReader() {
        return reader;
    }

    /**
     * {@inheritDoc}
     * @see com.lowagie.text.pdf.parser.TextAssembler#setPage(int)
     */
    @Override
    public void setPage(int page) {
        this.page = page;
    }

    /**
     * {@inheritDoc}
     * @see com.lowagie.text.pdf.parser.TextAssembler#getWordId()
     */
    @Override
    public String getWordId() {
        return "word" + wordIdCounter++;
    }

}