All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.lowagie.text.pdf.parser.MarkedUpTextAssembler Maven / Gradle / Ivy

There is a newer version: 2.0.3
Show newest version
/**
 * Copyright 2014 by Tizra Inc.
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 * 

 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *

 * The Original Code is 'iText, a free JAVA-PDF library'.
 *

 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
 *

 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *

 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above. If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *

 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *

* This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 */
package com.lowagie.text.pdf.parser;

import com.lowagie.text.pdf.PdfReader;
import java.util.ArrayList;
import java.util.List;

/**
 * We'll get called on a variety of marked section content (perhaps including
 * the results of nested sections), and will assemble it into an order as we
 * can.
 *
 * <p>Incoming content is buffered in {@code partialWords} until a parsing
 * context ends or an already-assembled chunk arrives; at that point the
 * buffered pieces are replayed through {@code renderText}, which merges
 * adjacent fragments into words and appends the finished text to
 * {@code result}.
 *
 * @author dgd
 */
public class MarkedUpTextAssembler implements TextAssembler {

    /** Fraction of the incoming fragment's ascent used as the same-line threshold. */
    private static final float SAME_LINE_ASCENT_FACTOR = 0.5f;

    /**
     * A gap narrower than the fragment's single-space width divided by this
     * value causes the fragment to be merged into the word in progress.
     */
    private static final double WORD_JOIN_SPACE_DIVISOR = 2.3;

    /**
     * our result may be partially processed already, in which case we'll just
     * add things to it, once ready.
     */
    List<FinalText> result = new ArrayList<>();

    private final PdfReader reader;
    private final boolean usePdfMarkupElements;

    /**
     * as we get new content (final or not), we accumulate it until we reach the
     * end of a parsing unit
     *
     * Each parsing unit may have a tag name that should wrap its content
     */
    private final List<TextAssemblyBuffer> partialWords = new ArrayList<>();

    /** Word currently being built from consecutive fragments, or {@code null} if none. */
    private ParsedTextImpl inProgress = null;
    private int page;
    private int wordIdCounter = 1;

    /**
     * Creates an assembler that does not emit markup elements.
     *
     * @param reader reader handed on to {@code getFinalText} when text is finalized
     */
    MarkedUpTextAssembler(PdfReader reader) {
        this(reader, false);
    }

    /**
     * Creates an assembler.
     *
     * @param reader               reader handed on to {@code getFinalText} when
     *                             text is finalized
     * @param usePdfMarkupElements whether element tags and line-break markup
     *                             are written into the assembled text
     */
    MarkedUpTextAssembler(PdfReader reader, boolean usePdfMarkupElements) {
        this.reader = reader;
        this.usePdfMarkupElements = usePdfMarkupElements;
    }

    /**
     * Remember an unassembled chunk until we hit the end of this element, or we
     * hit an assembled chunk, and need to pull things together.
     *
     * @param unassembled chunk of text rendering instruction to contribute to final
     *                    text
     * @param contextName not used by this implementation
     */
    @Override
    public void process(ParsedText unassembled, String contextName) {
        partialWords.addAll(unassembled.getAsPartialWords());
    }

    /**
     * Slot fully-assembled chunk into our result at the current location. If
     * there are unassembled chunks waiting, assemble them first.
     *
     * @param completed   This is a chunk from a nested element
     * @param contextName not used by this implementation
     */
    @Override
    public void process(FinalText completed, String contextName) {
        clearAccumulator();
        result.add(completed);
    }

    /**
     * {@inheritDoc}
     *
     * @see com.lowagie.text.pdf.parser.TextAssembler#process(Word, String)
     */
    @Override
    public void process(Word completed, String contextName) {
        partialWords.add(completed);
    }

    /**
     * Assembles every buffered partial word, empties the buffer, and then
     * moves the word in progress (if there is one) into {@code result}.
     */
    private void clearAccumulator() {
        for (TextAssemblyBuffer partialWord : partialWords) {
            // Visit each partialWord, calling renderText
            partialWord.assemble(this);
        }
        partialWords.clear();
        if (inProgress != null) {
            flushInProgress();
            inProgress = null;
        }
    }

    /** Appends the finalized form of the word in progress to {@code result}. */
    private void flushInProgress() {
        result.add(inProgress.getFinalText(reader, page, this, usePdfMarkupElements));
    }

    /**
     * Joins everything collected in {@code result} into one text and empties
     * {@code result}.
     *
     * <p>When markup is enabled and the element name is not empty, the text is
     * wrapped in an opening tag made of the full name and a closing tag made of
     * the name up to its first space.
     *
     * @param containingElementName name of the enclosing element; {@code null}
     *                              marks a formatting artifact
     * @return the concatenated text, or {@code null} if
     *         {@code containingElementName} is {@code null} (in which case
     *         {@code result} is left untouched)
     */
    private FinalText concatenateResult(String containingElementName) {
        // null element means that this is a formatting artifact, not content.
        if (containingElementName == null) {
            // at some point, we may want to extract alternate text for some
            // artifacts.
            return null;
        }
        boolean wrapInElement = usePdfMarkupElements && !containingElementName.isEmpty();
        StringBuilder res = new StringBuilder();
        if (wrapInElement) {
            res.append('<').append(containingElementName).append('>');
        }
        for (FinalText item : result) {
            res.append(item.getText());
        }
        // important, as the stuff buffered in the result is now used up!
        result.clear();
        if (wrapInElement) {
            // The closing tag carries only the text before the first space.
            int spacePos = containingElementName.indexOf(' ');
            String closingName = spacePos >= 0
                    ? containingElementName.substring(0, spacePos)
                    : containingElementName;
            res.append("</").append(closingName).append('>');
        }
        return new FinalText(res.toString());
    }

    /**
     * {@inheritDoc}
     *
     * @see com.lowagie.text.pdf.parser.TextAssembler#endParsingContext(String)
     */
    @Override
    public FinalText endParsingContext(String containingElementName) {
        clearAccumulator();
        return concatenateResult(containingElementName);
    }

    /**
     * Discards all collected results, buffered partial words and the word in
     * progress. The page number and the word id counter are not changed.
     *
     * @see com.lowagie.text.pdf.parser.TextAssembler#reset()
     */
    @Override
    public void reset() {
        result.clear();
        partialWords.clear();
        inProgress = null;
    }

    /**
     * Appends already-final text directly to the result.
     *
     * @param finalText text to append
     */
    @Override
    public void renderText(FinalText finalText) {
        result.add(finalText);
    }

    /**
     * Captures text using a simplified algorithm for inserting hard returns and
     * spaces
     *
     * <p>The first fragment simply becomes the word in progress. Each later
     * fragment either starts a new line, is merged into the word in progress,
     * or causes the word in progress to be emitted and replaced.
     *
     * @param partialWord fragment to merge into, or to replace, the word in
     *                    progress
     * @see com.lowagie.text.pdf.parser.GraphicsState
     * @see com.lowagie.text.pdf.parser.Matrix
     */
    @Override
    public void renderText(ParsedTextImpl partialWord) {
        if (inProgress == null) {
            inProgress = partialWord;
            return;
        }

        Vector start = partialWord.getStartPoint();
        Vector lastStart = inProgress.getStartPoint();
        Vector lastEnd = inProgress.getEndPoint();

        // see
        // http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
        Vector baselineOffset = inProgress.getBaseline().subtract(lastStart);
        float dist = baselineOffset.cross(lastStart.subtract(start)).lengthSquared()
                / baselineOffset.lengthSquared();

        // let's try using 25% of current leading for vertical slop.
        // NOTE(review): the factor applied here is 0.5 of the ascent, and dist
        // is a ratio of squared lengths - confirm both are intended.
        float sameLineThreshold = partialWord.getAscent() * SAME_LINE_ASCENT_FACTOR;
        // A NaN distance (0/0 in the division above) is treated as a new line.
        boolean hardReturn = dist > sameLineThreshold || Float.isNaN(dist);

        /*
         * Note: Technically, we should check both the start and end positions,
         * in case the angle of the text changed without any displacement but
         * this sort of thing probably doesn't happen much in reality, so we'll
         * leave it alone for now
         */
        float spacing = lastEnd.subtract(start).length();

        if (hardReturn || partialWord.breakBefore()) {
            flushInProgress();
            if (hardReturn) {
                result.add(new FinalText("\n"));
                if (usePdfMarkupElements) {
                    // NOTE(review): line-break markup literal - confirm the
                    // exact tag text against the upstream source.
                    result.add(new FinalText("<br class='t-pdf' />"));
                }
            }
            inProgress = partialWord;
        } else if (spacing < partialWord.getSingleSpaceWidth() / WORD_JOIN_SPACE_DIVISOR
                || inProgress.shouldNotSplit()) {
            // Close enough (or not splittable): extend the word in progress so
            // that it runs from its own start to the new fragment's end.
            inProgress = new Word(inProgress.getText() + partialWord.getText().trim(),
                    partialWord.getAscent(),
                    partialWord.getDescent(),
                    lastStart,
                    partialWord.getEndPoint(),
                    inProgress.getBaseline(),
                    partialWord.getSingleSpaceWidth(),
                    inProgress.shouldNotSplit(),
                    inProgress.breakBefore());
        } else {
            flushInProgress();
            inProgress = partialWord;
        }
    }

    /**
     * Getter.
     *
     * @return reader
     */
    protected PdfReader getReader() {
        return reader;
    }

    /**
     * {@inheritDoc}
     *
     * @see com.lowagie.text.pdf.parser.TextAssembler#setPage(int)
     */
    @Override
    public void setPage(int page) {
        this.page = page;
    }

    /**
     * {@inheritDoc}
     *
     * <p>Returns {@code "word"} followed by a counter that starts at 1 and is
     * incremented on every call.
     *
     * @see com.lowagie.text.pdf.parser.TextAssembler#getWordId()
     */
    @Override
    public String getWordId() {
        return "word" + wordIdCounter++;
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy