All downloads are free. The search and download features use the official Maven repository.

com.openhtmltopdf.layout.WhitespaceStripper Maven / Gradle / Ivy

Go to download

Open HTML to PDF is a CSS 2.1 renderer written in Java. This artifact contains the core rendering and layout code.

There is a newer version: 1.1.4
Show newest version
/*
 * Copyright (c) 2004, 2005 Torbjoern Gannholm
 * Copyright (c) 2006 Wisconsin Court System
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
package com.openhtmltopdf.layout;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import com.openhtmltopdf.css.constants.CSSName;
import com.openhtmltopdf.css.constants.IdentValue;
import com.openhtmltopdf.css.style.CalculatedStyle;
import com.openhtmltopdf.render.InlineBox;

/**
 * Static helpers that collapse and strip whitespace in the text of
 * {@link InlineBox} instances according to their {@code white-space} style.
 * The class holds no mutable state; all work is done on the arguments passed in.
 *
 * @author Torbjoern Gannholm
 */
public class WhitespaceStripper {
    public static final String SPACE = " ";
    public static final String EOL = "\n";
    public static final char EOLC = '\n';

    // Pattern instances are immutable and thread-safe, so they are compiled once and shared.
    public static final Pattern linefeed_space_collapse = Pattern.compile("\\s+\\n\\s+");
    public static final Pattern linefeed_to_space = Pattern.compile("\\n");
    public static final Pattern tab_to_space = Pattern.compile("\\t");
    public static final Pattern space_collapse = Pattern.compile("(?: )+");
    public static final Pattern space_before_linefeed_collapse = Pattern.compile("[\\s&&[^\\n]]\\n");

    /**
     * Strips whitespace early in inline content generation. This can be done
     * because "white-space" does not apply to :first-line and :first-letter. For
     * dynamic pseudo-classes we are allowed to choose which properties apply.
     * <p>
     * Each inline node has its text collapsed in list order, carrying the
     * "collapse the next leading space" state from one inline box to the next.
     * If every inline box ends up as removable whitespace and no other node
     * blocks collapsing, the text content of the whole list is stripped.
     * <p>
     * NOTE: The inlineContent parameter may be modified (box text is rewritten
     * and inline boxes may be removed from the list).
     *
     * @param inlineContent the inline content to strip the whitespaces on
     */
    public static void stripInlineContent(List<Styleable> inlineContent) {
        boolean collapse = false;
        boolean allWhitespace = true;

        for (Styleable node : inlineContent) {
            if (node.getStyle().isInline()) {
                InlineBox iB = (InlineBox) node;
                collapse = stripWhitespace(iB, collapse);
                if (!iB.isRemovableWhitespace()) {
                    allWhitespace = false;
                }
            } else if (!canCollapseThrough(node)) {
                // A non-inline node that is not floated/absolute/fixed/running
                // interrupts the run: whitespace after it starts fresh.
                allWhitespace = false;
                collapse = false;
            }
        }

        if (allWhitespace) {
            stripTextContent(inlineContent);
        }
    }

    /**
     * Tells whether whitespace collapsing may continue across the given
     * non-inline node.
     *
     * @param styleable the node to inspect
     * @return {@code true} if the node's style is floated, absolute, fixed or running
     */
    private static boolean canCollapseThrough(Styleable styleable) {
        CalculatedStyle style = styleable.getStyle();
        return style.isFloated() || style.isAbsolute() || style.isFixed() || style.isRunning();
    }

    /**
     * Truncates the text of every inline box in the list and, when none of
     * those boxes is backed by an element, removes the inline boxes from the
     * list altogether.
     *
     * @param stripped the content list to strip; modified in place
     */
    private static void stripTextContent(List<Styleable> stripped) {
        boolean onlyAnonymous = true;
        for (Styleable node : stripped) {
            if (node.getStyle().isInline()) {
                InlineBox iB = (InlineBox) node;
                if (iB.getElement() != null) {
                    onlyAnonymous = false;
                }

                iB.truncateText();
            }
        }

        if (onlyAnonymous) {
            // Removal goes through the iterator so the list can be modified while it is walked.
            for (Iterator<Styleable> i = stripped.iterator(); i.hasNext(); ) {
                Styleable node = i.next();
                if (node.getStyle().isInline()) {
                    i.remove();
                }
            }
        }
    }

    /**
     * Strips whitespace from the text of one inline box according to the CSS
     * 2.1 rules on whitespace handling. It accounts for the different
     * {@code white-space} settings like normal, nowrap, pre, etc., stores the
     * processed text back on the box and flags the box as removable
     * whitespace where applicable.
     *
     * @param iB              the InlineBox to strip. The text in it is
     *                        modified.
     * @param collapseLeading whether a leading space in this box should be
     *                        collapsed, i.e. the value returned for the
     *                        preceding inline box
     * @return whether the next leading space should collapse or
     *         not.
     */
    private static boolean stripWhitespace(InlineBox iB, boolean collapseLeading) {
        IdentValue whitespace = iB.getStyle().getIdent(CSSName.WHITE_SPACE);

        String text = collapseWhitespace(iB, whitespace, iB.getText(), collapseLeading);

        // NOTE(review): PRE is included here although collapseWhitespace never
        // collapses spaces for PRE -- confirm this is intended before changing it.
        boolean collapseNext = text.endsWith(SPACE)
                && (whitespace == IdentValue.NORMAL
                        || whitespace == IdentValue.NOWRAP
                        || whitespace == IdentValue.PRE);

        iB.setText(text);
        if (text.trim().isEmpty()) {
            if (whitespace == IdentValue.NORMAL || whitespace == IdentValue.NOWRAP) {
                iB.setRemovableWhitespace(true);
            } else if (whitespace == IdentValue.PRE) {
                // Explicit for clarity; the original author notes this is already the default.
                iB.setRemovableWhitespace(false);
            } else if (text.indexOf(EOL) < 0) {
                // Written with pre-line in mind: blank text without a line feed is removable.
                // NOTE(review): this branch is reached for every remaining
                // white-space value (e.g. pre-wrap), not only pre-line.
                iB.setRemovableWhitespace(true);
            }
        }

        // An empty box does not change the state: pass the incoming flag through.
        return text.isEmpty() ? collapseLeading : collapseNext;
    }

    /**
     * Collapse whitespace for normal or no-wrap modes. Much faster (15x in simple testing)
     * than using multiple regular expressions.
     * <p>
     * Every run of line feeds, tabs and spaces becomes a single space; if
     * {@code collapseLeading} is set, a run at the very start is dropped entirely.
     * <p>
     * NOTE: Slightly different behavior to using regular expressions as definition of space characters
     * differ, but I believe this is the correct definition according to CSS specifications.
     *
     * @param text            the text to collapse
     * @param collapseLeading whether to treat the text as if a space had
     *                        already been emitted just before it
     * @return the collapsed text
     */
    private static String collapseWhitespaceNormalOrNoWrap(String text, boolean collapseLeading) {
        int length = text.length();
        StringBuilder builder = new StringBuilder(length);
        // True while the last thing emitted (or preceding the text) was a space.
        boolean spaceAdded = collapseLeading;

        for (int i = 0; i < length; i++) {
            char ch = text.charAt(i);
            boolean collapsible = ch == '\n' || ch == '\t' || ch == ' ';

            if (!collapsible) {
                builder.append(ch);
                spaceAdded = false;
            } else if (!spaceAdded) {
                builder.append(' ');
                spaceAdded = true;
            }
            // else: collapsible character right after a space -> dropped
        }

        return builder.toString();
    }

    /**
     * Applies the whitespace transformation that matches the given
     * {@code white-space} value:
     * <ul>
     * <li>normal / nowrap: runs of line feeds, tabs and spaces collapse to one space;</li>
     * <li>pre: one non-linefeed whitespace character directly before a line
     *     feed is dropped, and tabs expand to {@code tab-size} spaces;</li>
     * <li>pre-wrap: tabs expand to {@code tab-size} spaces;</li>
     * <li>pre-line: tabs become spaces and runs of spaces collapse to one space;</li>
     * <li>any other value: the text is returned unchanged.</li>
     * </ul>
     *
     * @param iB              the box whose style supplies the tab size
     * @param whitespace      the {@code white-space} value in effect
     * @param text            the text to process
     * @param collapseLeading whether a leading space should be collapsed
     *                        (only used for normal / nowrap)
     * @return the processed text
     */
    private static String collapseWhitespace(InlineBox iB, IdentValue whitespace, String text, boolean collapseLeading) {
        if (whitespace == IdentValue.NORMAL || whitespace == IdentValue.NOWRAP) {
            return collapseWhitespaceNormalOrNoWrap(text, collapseLeading);
        } else if (whitespace == IdentValue.PRE) {
            text = space_before_linefeed_collapse.matcher(text).replaceAll(EOL);
        }

        if (whitespace == IdentValue.PRE || whitespace == IdentValue.PRE_WRAP) {
            // Clamp to zero so an unexpected negative tab size cannot trigger
            // a NegativeArraySizeException when the replacement is built.
            int tabSize = Math.max(0, (int) iB.getStyle().asFloat(CSSName.TAB_SIZE));
            char[] tabs = new char[tabSize];
            Arrays.fill(tabs, ' ');
            text = tab_to_space.matcher(text).replaceAll(new String(tabs));
        } else if (whitespace == IdentValue.PRE_LINE) {
            text = tab_to_space.matcher(text).replaceAll(SPACE);
            text = space_collapse.matcher(text).replaceAll(SPACE);
        }

        return text;
    }
}






© 2015 - 2024 Weber Informatics LLC | Privacy Policy