All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.w3c.tidy.Clean Maven / Gradle / Ivy

/*
 *  Java HTML Tidy - JTidy
 *  HTML parser and pretty printer
 *
 *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
 *  Institute of Technology, Institut National de Recherche en
 *  Informatique et en Automatique, Keio University). All Rights
 *  Reserved.
 *
 *  Contributing Author(s):
 *
 *     Dave Raggett 
 *     Andy Quick  (translation to Java)
 *     Gary L Peskin  (Java development)
 *     Sami Lempinen  (release management)
 *     Fabrizio Giustina 
 *
 *  The contributing author(s) would like to thank all those who
 *  helped with testing, bug fixes, and patience.  This wouldn't
 *  have been possible without all of you.
 *
 *  COPYRIGHT NOTICE:
 *
 *  This software and documentation is provided "as is," and
 *  the copyright holders and contributing author(s) make no
 *  representations or warranties, express or implied, including
 *  but not limited to, warranties of merchantability or fitness
 *  for any particular purpose or that the use of the software or
 *  documentation will not infringe any third party patents,
 *  copyrights, trademarks or other rights.
 *
 *  The copyright holders and contributing author(s) will not be
 *  liable for any direct, indirect, special or consequential damages
 *  arising out of any use of the software or documentation, even if
 *  advised of the possibility of such damage.
 *
 *  Permission is hereby granted to use, copy, modify, and distribute
 *  this source code, or portions hereof, documentation and executables,
 *  for any purpose, without fee, subject to the following restrictions:
 *
 *  1. The origin of this source code must not be misrepresented.
 *  2. Altered versions must be plainly marked as such and must
 *     not be misrepresented as being the original source.
 *  3. This Copyright notice may not be removed or altered from any
 *     source or altered source distribution.
 *
 *  The copyright holders and contributing author(s) specifically
 *  permit, without fee, and encourage the use of this source code
 *  as a component for supporting the Hypertext Markup Language in
 *  commercial products. If you use this source code in a product,
 *  acknowledgment is not required but would be appreciated.
 *
 */
package org.w3c.tidy;

/**
 * Clean up misuse of presentation markup. Filters from other formats such as Microsoft Word often make excessive use of
 * presentation markup such as font tags, B, I, and the align attribute. By applying a set of production rules, it is
 * straight forward to transform this to use CSS. Some rules replace some of the children of an element by style
 * properties on the element, e.g.
 * 
 * {@code
 * 

* ... *

. *

* ... *

* } *
* Such rules are applied to the element's content and then to the element itself until none of the rules more apply. * Having applied all the rules to an element, it will have a style attribute with one or more properties. Other rules * strip the element they apply to, replacing it by style properties on the contents, e.g. *
 * {@code
 * 
 * 
  • *

    * ...

  • *
    . *

    * ... * } *

    * These rules are applied to an element before processing its content and replace the current element by the first * element in the exposed content. After applying both sets of rules, you can replace the style attribute by a class * value and style rule in the document head. To support this, an association of styles and class names is built. A * naive approach is to rely on string matching to test when two property lists are the same. A better approach would be * to first sort the properties before matching. * * @author Dave Raggett [email protected] * @author Andy Quick [email protected] (translation to Java) * @author Fabrizio Giustina * @version $Revision$ ($Author$) */ public class Clean { /** * sequential number for generated css classes. */ private int classNum; /** * Tag table. */ private TagTable tt; /** * Instantiates a new Clean. * * @param tagTable tag table instance */ public Clean(TagTable tagTable) { this.tt = tagTable; } /** * Insert a css style property. * * @param props StyleProp instance * @param name property name * @param value property value * @return StyleProp containin the given property */ private StyleProp insertProperty(StyleProp props, String name, String value) { StyleProp first, prev, prop; int cmp; prev = null; first = props; while (props != null) { cmp = props.name.compareTo(name); if (cmp == 0) { // this property is already defined, ignore new value return first; } if (cmp > 0) // props.name > name { // insert before this prop = new StyleProp(name, value, props); if (prev != null) { prev.next = prop; } else { first = prop; } return first; } prev = props; props = props.next; } prop = new StyleProp(name, value, null); if (prev != null) { prev.next = prop; } else { first = prop; } return first; } /** * Create sorted linked list of properties from style string. * * @param prop StyleProp * @param style style string * @return StyleProp with given style */ private StyleProp createProps(StyleProp prop, String style) { int nameEnd; int valueEnd; int valueStart = 0; int nameStart = 0; boolean more; nameStart = 0; while (nameStart < style.length()) { while (nameStart < style.length() && style.charAt(nameStart) == ' ') { ++nameStart; } nameEnd = nameStart; while (nameEnd < style.length()) { if (style.charAt(nameEnd) == ':') { valueStart = nameEnd + 1; break; } ++nameEnd; } if (nameEnd >= style.length() || style.charAt(nameEnd) != ':') { break; } while (valueStart < style.length() && style.charAt(valueStart) == ' ') { ++valueStart; } valueEnd = valueStart; more = false; while (valueEnd < style.length()) { if (style.charAt(valueEnd) == ';') { more = true; break; } ++valueEnd; } prop = insertProperty(prop, style.substring(nameStart, nameEnd), style.substring(valueStart, valueEnd)); if (more) { nameStart = valueEnd + 1; continue; } break; } return prop; } /** * Create a css property. * * @param props StyleProp * @return css property as String */ private String createPropString(StyleProp props) { String style = ""; int len; StyleProp prop; // compute length for (len = 0, prop = props; prop != null; prop = prop.next) { len += prop.name.length() + 2; len += prop.value.length() + 2; } for (prop = props; prop != null; prop = prop.next) { style = style.concat(prop.name); style = style.concat(": "); style = style.concat(prop.value); if (prop.next == null) { break; } style = style.concat("; "); } return style; } /** * Creates a string with merged properties. * * @param style css style * @param property css properties * @return merged string */ private String addProperty(String style, String property) { StyleProp prop; prop = createProps(null, style); prop = createProps(prop, property); style = createPropString(prop); return style; } /** * Generates a new css class name. * * @param lexer Lexer * @return generated css class */ private String gensymClass(final Lexer lexer) { String pfx = lexer.configuration.cssPrefix; if (pfx == null) { pfx = "c"; } return pfx + ++classNum; } /** * Finds a css style. * * @param lexer Lexer * @param tag tag name * @param properties css properties * @return style string */ private String findStyle(Lexer lexer, String tag, String properties) { Style style; for (style = lexer.styles; style != null; style = style.next) { if (style.tag.equals(tag) && style.properties.equals(properties)) { return style.tagClass; } } style = new Style(tag, gensymClass(lexer), properties, lexer.styles); lexer.styles = style; return style.tagClass; } /** * Find style attribute in node, and replace it by corresponding class attribute. Search for class in style * dictionary otherwise gensym new class and add to dictionary. Assumes that node doesn't have a class attribute. * * @param lexer Lexer * @param node node with a style attribute */ private void style2Rule(Lexer lexer, Node node) { AttVal styleattr, classattr; String classname; styleattr = node.getAttrByName("style"); if (styleattr != null) { classname = findStyle(lexer, node.element, styleattr.value); classattr = node.getAttrByName("class"); // if there already is a class attribute then append class name after a space if (classattr != null) { classattr.value = classattr.value + " " + classname; node.removeAttribute(styleattr); } else { // reuse style attribute for class attribute styleattr.attribute = "class"; styleattr.value = classname; } } } /** * Adds a css rule for color. * * @param lexer Lexer * @param selector css selector * @param color color value */ private void addColorRule(Lexer lexer, String selector, String color) { if (color != null) { lexer.addStringLiteral(selector); lexer.addStringLiteral(" { color: "); lexer.addStringLiteral(color); lexer.addStringLiteral(" }\n"); } } /** * Move presentation attribs from body to style element. * *
         * background="foo" . body { background-image: url(foo) }
         * bgcolor="foo" . body { background-color: foo }
         * text="foo" . body { color: foo }
         * link="foo" . :link { color: foo }
         * vlink="foo" . :visited { color: foo }
         * alink="foo" . :active { color: foo }
         * 
    * * @param lexer Lexer * @param body body node */ private void cleanBodyAttrs(Lexer lexer, Node body) { AttVal attr; String bgurl = null; String bgcolor = null; String color = null; attr = body.getAttrByName("background"); if (attr != null) { bgurl = attr.value; attr.value = null; body.removeAttribute(attr); } attr = body.getAttrByName("bgcolor"); if (attr != null) { bgcolor = attr.value; attr.value = null; body.removeAttribute(attr); } attr = body.getAttrByName("text"); if (attr != null) { color = attr.value; attr.value = null; body.removeAttribute(attr); } if (bgurl != null || bgcolor != null || color != null) { lexer.addStringLiteral(" body {\n"); if (bgurl != null) { lexer.addStringLiteral(" background-image: url("); lexer.addStringLiteral(bgurl); lexer.addStringLiteral(");\n"); } if (bgcolor != null) { lexer.addStringLiteral(" background-color: "); lexer.addStringLiteral(bgcolor); lexer.addStringLiteral(";\n"); } if (color != null) { lexer.addStringLiteral(" color: "); lexer.addStringLiteral(color); lexer.addStringLiteral(";\n"); } lexer.addStringLiteral(" }\n"); } attr = body.getAttrByName("link"); if (attr != null) { addColorRule(lexer, " :link", attr.value); body.removeAttribute(attr); } attr = body.getAttrByName("vlink"); if (attr != null) { addColorRule(lexer, " :visited", attr.value); body.removeAttribute(attr); } attr = body.getAttrByName("alink"); if (attr != null) { addColorRule(lexer, " :active", attr.value); body.removeAttribute(attr); } } /** * Check deprecated attributes in body tag. * * @param lexer Lexer * @param doc document root node * @return true is the body doesn't contain deprecated attributes, false otherwise. */ private boolean niceBody(Lexer lexer, Node doc) { Node body = doc.findBody(lexer.configuration.tt); if (body != null) { if (body.getAttrByName("background") != null || body.getAttrByName("bgcolor") != null || body.getAttrByName("text") != null || body.getAttrByName("link") != null || body.getAttrByName("vlink") != null || body.getAttrByName("alink") != null) { lexer.badLayout |= Report.USING_BODY; return false; } } return true; } /** * Create style element using rules from dictionary. * * @param lexer Lexer * @param doc root node */ private void createStyleElement(Lexer lexer, Node doc) { Node node, head, body; Style style; AttVal av; if (lexer.styles == null && niceBody(lexer, doc)) { return; } node = lexer.newNode(Node.START_TAG, null, 0, 0, "style"); node.implicit = true; // insert type attribute av = new AttVal(null, null, '"', "type", "text/css"); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); node.attributes = av; body = doc.findBody(lexer.configuration.tt); lexer.txtstart = lexer.lexsize; if (body != null) { cleanBodyAttrs(lexer, body); } for (style = lexer.styles; style != null; style = style.next) { lexer.addCharToLexer(' '); lexer.addStringLiteral(style.tag); lexer.addCharToLexer('.'); lexer.addStringLiteral(style.tagClass); lexer.addCharToLexer(' '); lexer.addCharToLexer('{'); lexer.addStringLiteral(style.properties); lexer.addCharToLexer('}'); lexer.addCharToLexer('\n'); } lexer.txtend = lexer.lexsize; node.insertNodeAtEnd(lexer.newNode(Node.TEXT_NODE, lexer.lexbuf, lexer.txtstart, lexer.txtend)); // now insert style element into document head doc is root node. search its children for html node the head // node should be first child of html node head = doc.findHEAD(lexer.configuration.tt); if (head != null) { head.insertNodeAtEnd(node); } } /** * Ensure bidirectional links are consistent. * * @param node root node */ private void fixNodeLinks(Node node) { Node child; if (node.prev != null) { node.prev.next = node; } else { node.parent.content = node; } if (node.next != null) { node.next.prev = node; } else { node.parent.last = node; } for (child = node.content; child != null; child = child.next) { child.parent = node; } } /** * Used to strip child of node when the node has one and only one child. * * @param node parent node */ private void stripOnlyChild(Node node) { Node child; child = node.content; node.content = child.content; node.last = child.last; child.content = null; for (child = node.content; child != null; child = child.next) { child.parent = node; } } /** * Used to strip font start and end tags. * * @param element original node * @param pnode passed in as array to allow modification. pnode[0] will contain the final node * TODO remove the pnode parameter and make it a return value */ private void discardContainer(Node element, Node[] pnode) { Node node; Node parent = element.parent; if (element.content != null) { element.last.next = element.next; if (element.next != null) { element.next.prev = element.last; element.last.next = element.next; } else { parent.last = element.last; } if (element.prev != null) { element.content.prev = element.prev; element.prev.next = element.content; } else { parent.content = element.content; } for (node = element.content; node != null; node = node.next) { node.parent = parent; } pnode[0] = element.content; } else { if (element.next != null) { element.next.prev = element.prev; } else { parent.last = element.prev; } if (element.prev != null) { element.prev.next = element.next; } else { parent.content = element.next; } pnode[0] = element.next; } element.next = null; element.content = null; } /** * Add style property to element, creating style attribute as needed and adding ; delimiter. * * @param node node * @param property property added to node */ private void addStyleProperty(Node node, String property) { AttVal av; for (av = node.attributes; av != null; av = av.next) { if (av.attribute.equals("style")) { break; } } // if style attribute already exists then insert property if (av != null) { String s; s = addProperty(av.value, property); av.value = s; } else { // else create new style attribute av = new AttVal(node.attributes, null, '"', "style", property); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); node.attributes = av; } } /** * Create new string that consists of the combined style properties in s1 and s2. To merge property lists, we build * a linked list of property/values and insert properties into the list in order, merging values for the same * property name. * * @param s1 first property * @param s2 second property * @return merged properties */ private String mergeProperties(String s1, String s2) { String s; StyleProp prop; prop = createProps(null, s1); prop = createProps(prop, s2); s = createPropString(prop); return s; } /** * Merge class attributes from 2 nodes. * * @param node Node * @param child Child node */ private void mergeClasses(Node node, Node child) { AttVal av; String s1, s2, names; for (s2 = null, av = child.attributes; av != null; av = av.next) { if ("class".equals(av.attribute)) { s2 = av.value; break; } } for (s1 = null, av = node.attributes; av != null; av = av.next) { if ("class".equals(av.attribute)) { s1 = av.value; break; } } if (s1 != null) { if (s2 != null) // merge class names from both { names = s1 + ' ' + s2; av.value = names; } } else if (s2 != null) // copy class names from child { av = new AttVal(node.attributes, null, '"', "class", s2); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); node.attributes = av; } } /** * Merge style from 2 nodes. * * @param node Node * @param child Child node */ private void mergeStyles(Node node, Node child) { AttVal av; String s1, s2, style; // the child may have a class attribute used for attaching styles, if so the class name needs to be copied to // node's class mergeClasses(node, child); for (s2 = null, av = child.attributes; av != null; av = av.next) { if (av.attribute.equals("style")) { s2 = av.value; break; } } for (s1 = null, av = node.attributes; av != null; av = av.next) { if (av.attribute.equals("style")) { s1 = av.value; break; } } if (s1 != null) { if (s2 != null) // merge styles from both { style = mergeProperties(s1, s2); av.value = style; } } else if (s2 != null) // copy style of child { av = new AttVal(node.attributes, null, '"', "style", s2); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); node.attributes = av; } } /** * Map a % font size to a named font size. * * @param size size in % * @return font size name */ private String fontSize2Name(String size) { String[] sizes = {"60%", "70%", "80%", null, "120%", "150%", "200%"}; String buf; if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6') { int n = size.charAt(0) - '0'; return sizes[n]; } if (size.length() > 0 && size.charAt(0) == '-') { if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') { int n = size.charAt(1) - '0'; double x; for (x = 1.0; n > 0; --n) { x *= 0.8; } x *= 100.0; buf = "" + (int) x + "%"; return buf; } return "smaller"; /* "70%"; */ } if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') { int n = size.charAt(1) - '0'; double x; for (x = 1.0; n > 0; --n) { x *= 1.2; } x *= 100.0; buf = "" + (int) x + "%"; return buf; } return "larger"; /* "140%" */ } /** * Adds a font-family style. * * @param node Node * @param face font face */ private void addFontFace(Node node, String face) { addStyleProperty(node, "font-family: " + face); } /** * Adds a font size style. * * @param node Node * @param size font size */ private void addFontSize(Node node, String size) { if (size == null) { return; } if ("6".equals(size) && node.tag == this.tt.tagP) { node.element = "h1"; this.tt.findTag(node); return; } if ("5".equals(size) && node.tag == this.tt.tagP) { node.element = "h2"; this.tt.findTag(node); return; } if ("4".equals(size) && node.tag == this.tt.tagP) { node.element = "h3"; this.tt.findTag(node); return; } String value = fontSize2Name(size); if (value != null) { addStyleProperty(node, "font-size: " + value); } } /** * Adds a font color style. * * @param node Node * @param color color value */ private void addFontColor(Node node, String color) { addStyleProperty(node, "color: " + color); } /** * Adds an align style. * * @param node Node * @param align align value */ private void addAlign(Node node, String align) { // force alignment value to lower case addStyleProperty(node, "text-align: " + align.toLowerCase()); } /** * Add style properties to node corresponding to the font face, size and color attributes. * * @param node font tag * @param av attribute list for node */ private void addFontStyles(Node node, AttVal av) { while (av != null) { switch (av.attribute) { case "face": addFontFace(node, av.value); break; case "size": addFontSize(node, av.value); break; case "color": addFontColor(node, av.value); break; default: break; } av = av.next; } } /** * Symptom: <p align=center>. Action: <p style="text-align: center">. * * @param lexer Lexer * @param node node with center attribute. Will be modified to use css style. */ private void textAlign(Lexer lexer, Node node) { AttVal av, prev; prev = null; for (av = node.attributes; av != null; av = av.next) { if (av.attribute.equals("align")) { if (prev != null) { prev.next = av.next; } else { node.attributes = av.next; } if (av.value != null) { addAlign(node, av.value); } break; } prev = av; } } /* Symptom: Action:
    */ private void tableBgColor(final Node node) { final AttVal attr = node.getAttrByName("bgcolor"); if (null != attr) { node.removeAttribute(attr); addStyleProperty(node, "background-color: " + attr.value); } } /** * Symptom: <dir><li> where <li> is only child. Action: coerce * <dir> <li> to <div> with indent. The clean up rules use the pnode argument * to return the next node when the original node has been deleted. * * @param lexer Lexer * @param node dir tag * @return true if a dir tag has been coerced to a div */ private boolean dir2Div(Lexer lexer, Node node) { Node child; if (node.tag == this.tt.tagDir || node.tag == this.tt.tagUl || node.tag == this.tt.tagOl) { child = node.content; if (child == null) { return false; } // check child has no peers if (child.next != null) { return false; } if (child.tag != this.tt.tagLi) { return false; } if (!child.implicit) { return false; } // coerce dir to div node.tag = this.tt.tagDiv; node.element = "div"; addStyleProperty(node, "margin-left: 2em"); stripOnlyChild(node); return true; } return false; } /** * Symptom: * *
         * <center>
         * 
    . *

    * Action: replace <center> by <div style="text-align: center"> *

    * * @param lexer Lexer * @param node center tag * @param pnode pnode[0] is the same as node, passed in as an array to allow modification * @return true if a center tag has been replaced by a div */ private boolean center2Div(Lexer lexer, Node node, Node[] pnode) { if (node.tag == this.tt.tagCenter) { if (lexer.configuration.dropFontTags) { if (node.content != null) { Node last = node.last; Node parent = node.parent; discardContainer(node, pnode); node = lexer.inferredTag("br"); if (last.next != null) { last.next.prev = node; } node.next = last.next; last.next = node; node.prev = last; if (parent.last == last) { parent.last = node; } node.parent = parent; } else { Node prev = node.prev; Node next = node.next; Node parent = node.parent; discardContainer(node, pnode); node = lexer.inferredTag("br"); node.next = next; node.prev = prev; node.parent = parent; if (next != null) { next.prev = node; } else { parent.last = node; } if (prev != null) { prev.next = node; } else { parent.content = node; } } return true; } node.tag = this.tt.tagDiv; node.element = "div"; addStyleProperty(node, "text-align: center"); return true; } return false; } /** * Symptom: <div><div>...</div></div> Action: merge the two divs. This is useful after * nested <dir>s used by Word for indenting have been converted to <div>s. * * @param lexer Lexer * @param node first div * @return true if the divs have been merged */ private boolean mergeDivs(Lexer lexer, Node node) { Node child; if (node.tag != this.tt.tagDiv) { return false; } child = node.content; if (child == null) { return false; } if (child.tag != this.tt.tagDiv) { return false; } if (child.next != null) { return false; } mergeStyles(node, child); stripOnlyChild(node); return true; } /** * Symptom: *
      *
    • *
        * ... *
      *
    • *
    * Action: discard outer list. * * @param lexer Lexer * @param node Node * @param pnode passed in as array to allow modifications. * @return true if nested lists have been found and replaced */ private boolean nestedList(Lexer lexer, Node node, Node[] pnode) { Node child, list; if (node.tag == this.tt.tagUl || node.tag == this.tt.tagOl) { child = node.content; if (child == null) { return false; } // check child has no peers if (child.next != null) { return false; } list = child.content; if (list == null) { return false; } if (list.tag != node.tag) { return false; } pnode[0] = list; // Set node to resume iteration // move inner list node into position of outer node list.prev = node.prev; list.next = node.next; list.parent = node.parent; fixNodeLinks(list); // get rid of outer ul and its li // XXX: Are we leaking the child node? -creitzel 7 Jun, 01 child.content = null; node.content = null; node.next = null; node = null; // If prev node was a list the chances are this node should be appended to that list. Word has no way of // recognizing nested lists and just uses indents if (list.prev != null) { if (list.prev.tag == this.tt.tagUl || list.prev.tag == this.tt.tagOl) { node = list; list = node.prev; list.next = node.next; if (list.next != null) { list.next.prev = list; } child = list.last; /*
  • */ node.parent = child; node.next = null; node.prev = child.last; fixNodeLinks(node); cleanNode(lexer, node); } } return true; } return false; } /** * Symptom: the only child of a block-level element is a presentation element such as B, I or FONT. Action: add * style "font-weight: bold" to the block and strip the <b>element, leaving its children. example: * *
         * <p>
         * <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
         * </p>
         * 
    *

    * becomes: * *

         * <p style="font-weight: bold; font-family: Arial; font-size: 6">
         * Draft Recommended Practice
         * </p>
         * 
    * *

    * This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator * 4, this isn't done for the elements: caption, tr and table *

    * * @param lexer Lexer * @param node parent node * @return true if the child node has been removed */ private boolean blockStyle(Lexer lexer, Node node) { /* check for bgcolor */ if (node.tag == tt.tagTable || node.tag == tt.tagTd || node.tag == tt.tagTh || node.tag == tt.tagTr) { tableBgColor(node); } Node child; if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) { if (node.tag != this.tt.tagTable && node.tag != this.tt.tagTr && node.tag != this.tt.tagLi) { // check for align attribute if (node.tag != this.tt.tagCaption) { textAlign(lexer, node); } child = node.content; if (child == null) { return false; } // check child has no peers if (child.next != null) { return false; } if (child.tag == this.tt.tagB) { mergeStyles(node, child); addStyleProperty(node, "font-weight: bold"); stripOnlyChild(node); return true; } if (child.tag == this.tt.tagI) { mergeStyles(node, child); addStyleProperty(node, "font-style: italic"); stripOnlyChild(node); return true; } if (child.tag == this.tt.tagFont) { mergeStyles(node, child); addFontStyles(node, child.attributes); stripOnlyChild(node); return true; } } } return false; } /** * If the node has only one b, i, or font child remove the child node and add the appropriate style attributes to * parent. * * @param lexer Lexer * @param node parent node * @param pnode passed as an array to allow modifications * @return true if child node has been stripped, replaced by style attributes. */ private boolean inlineStyle(Lexer lexer, Node node, Node[] pnode) { Node child; if (node.tag != this.tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) { child = node.content; if (child == null) { return false; } // check child has no peers if (child.next != null) { return false; } if (child.tag == this.tt.tagB && lexer.configuration.logicalEmphasis) { mergeStyles(node, child); addStyleProperty(node, "font-weight: bold"); stripOnlyChild(node); return true; } if (child.tag == this.tt.tagI && lexer.configuration.logicalEmphasis) { mergeStyles(node, child); addStyleProperty(node, "font-style: italic"); stripOnlyChild(node); return true; } if (child.tag == this.tt.tagFont) { mergeStyles(node, child); addFontStyles(node, child.attributes); stripOnlyChild(node); return true; } } return false; } /** * Replace font elements by span elements, deleting the font element's attributes and replacing them by a single * style attribute. * * @param lexer Lexer * @param node font tag * @param pnode passed as an array to allow modifications * @return true if a font tag has been dropped and replaced by style attributes */ private boolean font2Span(Lexer lexer, Node node, Node[] pnode) { AttVal av, style, next; if (node.tag == this.tt.tagFont) { if (lexer.configuration.dropFontTags) { discardContainer(node, pnode); return false; } // if FONT is only child of parent element then leave alone if (node.parent.content == node && node.next == null) { return false; } addFontStyles(node, node.attributes); // extract style attribute and free the rest av = node.attributes; style = null; while (av != null) { next = av.next; if (av.attribute.equals("style")) { av.next = null; style = av; } av = next; } node.attributes = style; node.tag = this.tt.tagSpan; node.element = "span"; return true; } return false; } /** * Applies all matching rules to a node. * * @param lexer Lexer * @param node original node * @return cleaned up node */ private Node cleanNode(Lexer lexer, Node node) { Node next = null; Node[] o = new Node[1]; boolean b = false; for (next = node; node != null && node.isElement(); node = next) { o[0] = next; b = dir2Div(lexer, node); next = o[0]; if (b) { continue; } // Special case: true result means that arg node and its parent no longer exist. // So we must jump back up the CreateStyleProperties() call stack until we have a valid node reference. b = nestedList(lexer, node, o); next = o[0]; if (b) { return next; } b = center2Div(lexer, node, o); next = o[0]; if (b) { continue; } b = mergeDivs(lexer, node); next = o[0]; if (b) { continue; } b = blockStyle(lexer, node); next = o[0]; if (b) { continue; } b = inlineStyle(lexer, node, o); next = o[0]; if (b) { continue; } b = font2Span(lexer, node, o); next = o[0]; if (b) { continue; } break; } return next; } /** * Special case: if the current node is destroyed by CleanNode() lower in the tree, this node and its parent no * longer exist. So we must jump back up the CreateStyleProperties() call stack until we have a valid node * reference. * * @param lexer Lexer * @param node Node * @param prepl passed in as array to allow modifications * @return cleaned Node */ private Node createStyleProperties(Lexer lexer, Node node, Node[] prepl) { Node child = node.content; if (child != null) { Node[] repl = new Node[1]; repl[0] = node; while (child != null) { child = createStyleProperties(lexer, child, repl); if (repl[0] != node) { return repl[0]; } if (child != null) { child = child.next; } } } return cleanNode(lexer, node); } /** * Find style attribute in node content, and replace it by corresponding class attribute. * * @param lexer Lexer * @param node parent node */ private void defineStyleRules(Lexer lexer, Node node) { Node child; if (node.content != null) { child = node.content; while (child != null) { defineStyleRules(lexer, child); child = child.next; } } style2Rule(lexer, node); } /** * Clean an html tree. * * @param lexer Lexer * @param doc root node */ public void cleanTree(Lexer lexer, Node doc) { Node[] repl = new Node[1]; repl[0] = doc; doc = createStyleProperties(lexer, doc, repl); if (doc != null && lexer.configuration.makeClean) { defineStyleRules(lexer, doc); createStyleElement(lexer, doc); } } /** * simplifies {@literal ... ... } etc. * * @param node root Node */ public void nestedEmphasis(Node node) { Node[] o = new Node[1]; Node next; while (node != null) { next = node.next; if ((node.tag == this.tt.tagB || node.tag == this.tt.tagI) && node.parent != null && node.parent.tag == node.tag) { // strip redundant inner element o[0] = next; discardContainer(node, o); next = o[0]; node = next; continue; } if (node.content != null) { nestedEmphasis(node.content); } node = next; } } /** * Replace i by em and b by strong. * * @param node root Node */ public void emFromI(Node node) { while (node != null) { if (node.tag == this.tt.tagI) { node.element = this.tt.tagEm.name; node.tag = this.tt.tagEm; } else if (node.tag == this.tt.tagB) { node.element = this.tt.tagStrong.name; node.tag = this.tt.tagStrong; } if (node.content != null) { emFromI(node.content); } node = node.next; } } /** * Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single * implicit li. This is recursively replaced by an implicit blockquote. * * @param node root Node */ public void list2BQ(Node node) { while (node != null) { if (node.content != null) { list2BQ(node.content); } if (node.tag != null && node.tag.getParser() == ParserImpl.LIST && node.hasOneChild() && node.content.implicit) { stripOnlyChild(node); node.element = this.tt.tagBlockquote.name; node.tag = this.tt.tagBlockquote; node.implicit = true; } node = node.next; } } /** * Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with * the indent set to match the nesting depth. * * @param node root Node */ public void bQ2Div(Node node) { int indent; String indentBuf; AttVal attval; while (node != null) { if (node.tag == this.tt.tagBlockquote && node.implicit) { indent = 1; while (node.hasOneChild() && node.content.tag == this.tt.tagBlockquote && node.implicit) { ++indent; stripOnlyChild(node); } if (node.content != null) { bQ2Div(node.content); } indentBuf = "margin-left: " + (new Integer(2 * indent)).toString() + "em"; node.element = this.tt.tagDiv.name; node.tag = this.tt.tagDiv; attval = node.getAttrByName("style"); if (attval != null && attval.value != null) { attval.value = indentBuf + "; " + attval.value; } else { node.addAttribute("style", indentBuf); } } else if (node.content != null) { bQ2Div(node.content); } node = node.next; } } /** * Find the enclosing table cell for the given node. * * @param node Node * @return enclosing cell node */ Node findEnclosingCell(Node node) { Node check; for (check = node; check != null; check = check.parent) { if (check.tag == tt.tagTd) { return check; } } return null; } /** * node is <![if ...]> prune up to <![endif]>. * * @param lexer Lexer * @param node Node * @return cleaned up Node */ public Node pruneSection(Lexer lexer, Node node) { for (; ; ) { // FG: commented out - don't add   to empty cells // if ((Lexer.getString(node.textarray, node.start, 21)).equals("if !supportEmptyParas")) // { // Node cell = findEnclosingCell(node); // if (cell != null) // { // // Need to put   into cell so it doesn't look weird // char onesixty[] = {(char) 160, (char) 0}; // Node nbsp = lexer.newLiteralTextNode(lexer, onesixty); // Node.insertNodeBeforeElement(node, nbsp); // } // } // discard node and returns next node = Node.discardElement(node); if (node == null) { return null; } if (node.type == Node.SECTION_TAG) { if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if")) { node = pruneSection(lexer, node); continue; } if ((TidyUtils.getString(node.textarray, node.start, 5)).equals("endif")) { node = Node.discardElement(node); break; } } } return node; } /** * Drop if/endif sections inserted by word2000. * * @param lexer Lexer * @param node Node root node */ public void dropSections(Lexer lexer, Node node) { while (node != null) { if (node.type == Node.SECTION_TAG) { // prune up to matching endif if ((TidyUtils.getString(node.textarray, node.start, 2)).equals("if") && (!(TidyUtils.getString(node.textarray, node.start, 7)).equals("if !vml"))) // #444394 - fix 13 // Sep 01 { node = pruneSection(lexer, node); continue; } // discard others as well node = Node.discardElement(node); continue; } if (node.content != null) { dropSections(lexer, node.content); } node = node.next; } } /** * Remove word2000 attributes from node. * * @param node node to cleanup */ public void purgeWord2000Attributes(Node node) { AttVal attr = null; AttVal next = null; AttVal prev = null; for (attr = node.attributes; attr != null; attr = next) { next = attr.next; // special check for class="Code" denoting pre text // Pass thru user defined styles as HTML class names if (attr.attribute != null && attr.value != null && attr.attribute.equals("class")) { if (attr.value.equals("Code") || !attr.value.startsWith("Mso")) { prev = attr; continue; } } if (attr.attribute != null && (attr.attribute.equals("class") || attr.attribute.equals("style") || attr.attribute.equals("lang") || attr.attribute.startsWith("x:") || ((attr.attribute.equals("height") || attr.attribute .equals("width")) && // (node.tag == this.tt.tagTd || node.tag == this.tt.tagTr || node.tag == this.tt.tagTh)))) { if (prev != null) { prev.next = next; } else { node.attributes = next; } } else { prev = attr; } } } /** * Word2000 uses span excessively, so we strip span out. * * @param lexer Lexer * @param span Node span * @return cleaned node */ public Node stripSpan(Lexer lexer, Node span) { Node node; Node prev = null; Node content; // deal with span elements that have content by splicing the content in place of the span after having // processed it cleanWord2000(lexer, span.content); content = span.content; if (span.prev != null) { prev = span.prev; } else if (content != null) { node = content; content = content.next; node.removeNode(); Node.insertNodeBeforeElement(span, node); prev = node; } while (content != null) { node = content; content = content.next; node.removeNode(); prev.insertNodeAfterElement(node); prev = node; } if (span.next == null) { span.parent.last = prev; } node = span.next; span.content = null; Node.discardElement(span); return node; } /** * Map non-breaking spaces to regular spaces. * * @param lexer Lexer * @param node Node */ private void normalizeSpaces(Lexer lexer, Node node) { while (node != null) { if (node.content != null) { normalizeSpaces(lexer, node.content); } if (node.type == Node.TEXT_NODE) { int i; int[] c = new int[1]; int p = node.start; for (i = node.start; i < node.end; ++i) { c[0] = node.textarray[i]; // look for UTF-8 multibyte character if (c[0] > 0x7F) { i += PPrint.getUTF8(node.textarray, i, c); } if (c[0] == 160) { c[0] = ' '; } p = PPrint.putUTF8(node.textarray, p, c[0]); } } node = node.next; } } /** * Used to hunt for hidden preformatted sections. * * @param node checked node * @return true if the node has a "margin-top: 0" or "margin-bottom: 0" style */ boolean noMargins(Node node) { AttVal attval = node.getAttrByName("style"); if (attval == null || attval.value == null) { return false; } // search for substring "margin-top: 0" if (!attval.value.contains("margin-top: 0")) { return false; } // search for substring "margin-top: 0" return attval.value.contains("margin-bottom: 0"); } /** * Does element have a single space as its content? * * @param lexer Lexer * @param node checked node * @return true if the element has a single space as its content */ boolean singleSpace(Lexer lexer, Node node) { if (node.content != null) { node = node.content; if (node.next != null) { return false; } if (node.type != Node.TEXT_NODE) { return false; } if (((node.end - node.start) == 1) && lexer.lexbuf[node.start] == ' ') { return true; } if ((node.end - node.start) == 2) { int[] c = new int[1]; PPrint.getUTF8(lexer.lexbuf, node.start, c); return c[0] == 160; } } return false; } /** * This is a major clean up to strip out all the extra stuff you get when you save as web page from Word 2000. It * doesn't yet know what to do with VML tags, but these will appear as errors unless you declare them as new tags, * such as o:p which needs to be declared as inline. * * @param lexer Lexer * @param node node to clean up */ public void cleanWord2000(Lexer lexer, Node node) { // used to a list from a sequence of bulletted p's Node list = null; while (node != null) { // get rid of Word's xmlns attributes if (node.tag == tt.tagHtml) { // check that it's a Word 2000 document if ((node.getAttrByName("xmlns:o") == null)) { return; } lexer.configuration.tt.freeAttrs(node); } // fix up preformatted sections by looking for a sequence of paragraphs with zero top/bottom margin if (node.tag == tt.tagP) { if (noMargins(node)) { Node pre; Node next; Node.coerceNode(lexer, node, tt.tagPre); purgeWord2000Attributes(node); if (node.content != null) { cleanWord2000(lexer, node.content); } pre = node; node = node.next; // continue to strip p's while (node.tag == tt.tagP && noMargins(node)) { next = node.next; node.removeNode(); pre.insertNodeAtEnd(lexer.newLineNode()); pre.insertNodeAtEnd(node); stripSpan(lexer, node); node = next; } } } if (node.tag != null && TidyUtils.toBoolean(node.tag.model & Dict.CM_BLOCK) && singleSpace(lexer, node)) { node = stripSpan(lexer, node); continue; } // discard Word's style verbiage if (node.tag == this.tt.tagStyle || node.tag == this.tt.tagMeta || node.type == Node.COMMENT_TAG) { node = Node.discardElement(node); continue; } // strip out all span and font tags Word scatters so liberally! if (node.tag == this.tt.tagSpan || node.tag == this.tt.tagFont) { node = stripSpan(lexer, node); continue; } if (node.tag == this.tt.tagLink) { AttVal attr = node.getAttrByName("rel"); if (attr != null && attr.value != null && attr.value.equals("File-List")) { node = Node.discardElement(node); continue; } } // discard empty paragraphs if (node.content == null && node.tag == this.tt.tagP) { node = Node.discardElement(node); continue; } if (node.tag == this.tt.tagP) { AttVal attr = node.getAttrByName("class"); AttVal atrStyle = node.getAttrByName("style"); // (JES) Sometimes Word marks a list item with the following hokie syntax //

    to

      ...
    // map

    to

      ...
    if (attr != null && attr.value != null && ((attr.value.equals("MsoListBullet") || attr.value.equals("MsoListNumber")) // || (atrStyle != null && (atrStyle.value.contains("mso-list:"))))) // 463066 - fix by Joel // Shafer 19 Sep 01 { Dict listType = tt.tagUl; if (attr.value.equals("MsoListNumber")) { listType = tt.tagOl; } Node.coerceNode(lexer, node, this.tt.tagLi); if (list == null || list.tag != listType) { list = lexer.inferredTag(listType.name); Node.insertNodeBeforeElement(node, list); } purgeWord2000Attributes(node); if (node.content != null) { cleanWord2000(lexer, node.content); } // remove node and append to contents of list node.removeNode(); list.insertNodeAtEnd(node); node = list; } // map sequence of

    to

     ... 
    else if (attr != null && attr.value != null && attr.value.equals("Code")) { Node br = lexer.newLineNode(); normalizeSpaces(lexer, node); if (list == null || list.tag != this.tt.tagPre) { list = lexer.inferredTag("pre"); Node.insertNodeBeforeElement(node, list); } // remove node and append to contents of list node.removeNode(); list.insertNodeAtEnd(node); stripSpan(lexer, node); list.insertNodeAtEnd(br); node = list.next; } else { list = null; } } else { list = null; } // strip out style and class attributes if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { purgeWord2000Attributes(node); } if (node.content != null) { cleanWord2000(lexer, node.content); } node = node.next; } } /** * Check if the current document is a converted Word document. * * @param root root Node * @return true if the document has been geenrated by Microsoft Word. */ public boolean isWord2000(Node root) { AttVal attval; Node node; Node head; Node html = root.findHTML(this.tt); if (html != null && html.getAttrByName("xmlns:o") != null) { return true; } head = root.findHEAD(tt); if (head == null) { // no HEAD, no return false; } // search for for (node = head.content; node != null; node = node.next) { if (node.tag != tt.tagMeta) { continue; } attval = node.getAttrByName("name"); if (attval == null || attval.value == null) { continue; } if (!"generator".equals(attval.value)) { continue; } attval = node.getAttrByName("content"); if (attval == null || attval.value == null) { continue; } if (attval.value.contains("Microsoft")) { return true; } } return false; } /** * Where appropriate move object elements from head to body. * * @param lexer Lexer * @param html html node */ static void bumpObject(Lexer lexer, Node html) { if (html == null) { return; } Node node, next, head = null, body = null; TagTable tt = lexer.configuration.tt; for (node = html.content; node != null; node = node.next) { if (node.tag == tt.tagHead) { head = node; } if (node.tag == tt.tagBody) { body = node; } } if (head != null && body != null) { for (node = head.content; node != null; node = next) { next = node.next; if (node.tag == tt.tagObject) { Node child; boolean bump = false; for (child = node.content; child != null; child = child.next) { // bump to body unless content is param if ((child.type == Node.TEXT_NODE && !node.isBlank(lexer)) || child.tag != tt.tagParam) { bump = true; break; } } if (bump) { node.removeNode(); body.insertNodeAtStart(node); } } } } } }