// com.lowagie.text.pdf.parser.Word Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of openpdf
// There is a newer version: 2.0.4
/**
 * Copyright 2014 by Tizra Inc.
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 * dgd: com.lowagie.text.pdf.parser
 */
package com.lowagie.text.pdf.parser;

import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.PdfReader;
import java.util.Locale;

/**
 * A word (or word fragment) of parsed text. It is visited by a
 * {@link TextAssembler} and can render itself either as plain text or as
 * an HTML {@code <span>} positioned with percentage-based CSS.
 *
 * @author dgd
 */
public class Word extends ParsedTextImpl {

    /**
     * Is this an indivisible fragment, because it contained a space or was split from a space-
     * containing string. Non-splittable words can be merged (into new non-splittable words).
     */
    boolean shouldNotSplit;
    /**
     * If this word or fragment was preceded by a space, or a line break, it should never be merged
     * into a preceding word.
     */
    boolean breakBefore;

    /**
     * @param text text content
     * @param ascent font ascent (e.g. height)
     * @param descent How far below the baseline letters go
     * @param startPoint first point of the text
     * @param endPoint ending offset of text
     * @param baseline line along which text is set.
     * @param spaceWidth how much space is a space supposed to take.
     * @param isCompleteWord word should never be split
     * @param breakBefore word starts here, should never combine to the left.
     */
    Word(String text, float ascent, float descent, Vector startPoint,
            Vector endPoint, Vector baseline, float spaceWidth, boolean isCompleteWord, boolean breakBefore) {
        super(text, startPoint, endPoint, baseline, ascent, descent, spaceWidth);
        shouldNotSplit = isCompleteWord;
        this.breakBefore = breakBefore;
    }

    /**
     * Accept a visitor that is assembling text.
     *
     * @param p the assembler that is visiting us.
     * @param contextName What is the wrapping markup element name if any
     * @see com.lowagie.text.pdf.parser.ParsedTextImpl#accumulate(com.lowagie.text.pdf.parser.TextAssembler, String)
     * @see com.lowagie.text.pdf.parser.TextAssemblyBuffer#accumulate(com.lowagie.text.pdf.parser.TextAssembler, String)
     */
    @Override
    public void accumulate(TextAssembler p, String contextName) {
        p.process(this, contextName);
    }

    /**
     * Accept a visitor that is assembling text.
     *
     * @param p the assembler that is visiting us.
     * @see com.lowagie.text.pdf.parser.TextAssemblyBuffer#assemble(com.lowagie.text.pdf.parser.TextAssembler)
     * @see com.lowagie.text.pdf.parser.ParsedTextImpl#assemble(com.lowagie.text.pdf.parser.TextAssembler)
     */
    @Override
    public void assemble(TextAssembler p) {
        p.renderText(this);
    }

    /**
     * Format a value as a CSS percentage with two decimals.
     *
     * @param f the percentage value (already scaled to 0..100)
     * @return e.g. {@code "12.34%"}
     */
    private static String formatPercent(float f) {
        // Locale.ROOT: CSS requires '.' as the decimal separator regardless
        // of the JVM's default locale.
        return String.format(Locale.ROOT, "%.2f%%", f);
    }

    /**
     * Generate markup for this word: a {@code <span>} whose inline CSS style
     * positions and sizes it in percentages of the page's crop box (falling
     * back to the trim box, then the media box).
     *
     * @param text
     *            passed in because we may have wanted to alter it, e.g. by
     *            trimming white space, or filtering characters or something.
     * @param reader
     *            the file reader from which we are extracting
     * @param page
     *            number of the page we are reading text from
     * @param assembler
     *            object to assemble text from fragments and larger strings on a
     *            page; also supplies the id for the generated element.
     * @return markup to represent this one word, or the empty string if the
     *         text is blank after non-breaking spaces are normalized and the
     *         text is trimmed.
     */
    private String wordMarkup(String text, PdfReader reader, int page,
            TextAssembler assembler) {

        Rectangle mediaBox = reader.getPageSize(page);
        Rectangle cropBox = reader.getBoxSize(page, "crop");
        // Treat no-break space and narrow no-break space as ordinary spaces
        // so that trim() can remove them.
        String cleaned = text.replace('\u00A0', ' ').replace('\u202f', ' ').trim();
        if (cleaned.isEmpty()) {
            return cleaned;
        }
        mediaBox.normalize();
        if (cropBox != null) {
            cropBox.normalize();
        } else {
            cropBox = reader.getBoxSize(page, "trim");
            if (cropBox != null) {
                cropBox.normalize();
            } else {
                cropBox = mediaBox;
            }
        }
        float xOffset = cropBox.getLeft() - mediaBox.getLeft();
        float yOffset = cropBox.getTop() - mediaBox.getTop();
        Vector startPoint = getStartPoint();
        Vector endPoint = getEndPoint();
        float pageWidth = cropBox.getWidth();
        float pageHeight = cropBox.getHeight();
        float leftPercent = (float) ((startPoint.get(0) - xOffset - mediaBox.getLeft()) / pageWidth
                * 100.0);
        float bottom = endPoint.get(1) + yOffset - getDescent() - mediaBox.getBottom();
        float bottomPercent = bottom / pageHeight * 100f;
        float widthPercent = getWidth() / pageWidth * 100.0f;
        float heightPercent = getAscent() / pageHeight * 100.0f;
        String myId = assembler.getWordId();

        StringBuilder result = new StringBuilder();
        result.append("<span class=\"t-word\" style=\"bottom: ")
                .append(formatPercent(bottomPercent)).append("; left: ")
                .append(formatPercent(leftPercent)).append("; width: ")
                .append(formatPercent(widthPercent)).append("; height: ")
                .append(formatPercent(heightPercent)).append(";\"")
                .append(" id=\"").append(myId).append("\">")
                .append(escapeHTML(cleaned)).append(" ");
        result.append("</span> ");

        return result.toString();
    }

    /**
     * Escape the characters that are significant in HTML text content.
     *
     * @param s raw text
     * @return {@code s} with {@code &}, {@code <} and {@code >} replaced by
     *         their character entities
     */
    private static String escapeHTML(String s) {
        // '&' must be replaced first so the entities produced below are not
        // themselves re-escaped.
        return s.replace("&", "&amp;").replace("<", "&lt;")
                .replace(">", "&gt;");
    }

    /**
     * Produce the final text for this word, either as markup or as the plain
     * text followed by a single space.
     *
     * @see com.lowagie.text.pdf.parser.TextAssemblyBuffer#getFinalText(PdfReader,
     *      int, TextAssembler, boolean)
     */
    @Override
    public FinalText getFinalText(PdfReader reader, int page,
            TextAssembler assembler, boolean useMarkup) {
        if (useMarkup) {
            return new FinalText(wordMarkup(getText(), reader, page, assembler));
        }
        return new FinalText(getText() + " ");
    }

    @Override
    public String toString() {
        return "[Word: [" + getText() + "] " + getStartPoint() + ", "
                + getEndPoint() + "] lead" + getAscent() + "]";
    }

    /**
     * @see com.lowagie.text.pdf.parser.ParsedTextImpl#shouldNotSplit()
     */
    @Override
    public boolean shouldNotSplit() {
        return shouldNotSplit;
    }

    /**
     * @see com.lowagie.text.pdf.parser.ParsedTextImpl#breakBefore()
     */
    @Override
    public boolean breakBefore() {
        return breakBefore;
    }
}