org.apache.xml.serialize.OutputFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of xercesImpl Show documentation
Xerces2 is the next generation of high performance, fully compliant XML parsers in the Apache Xerces family. This new version of Xerces introduces the Xerces Native Interface (XNI), a complete framework for building parser components and configurations that is extremely modular and easy to program. The Apache Xerces2 parser is the reference implementation of XNI but other parser components, configurations, and parsers can be written using the Xerces Native Interface. For complete design and implementation documents, refer to the XNI Manual. Xerces2 is a fully conforming XML Schema 1.0 processor. A partial experimental implementation of the XML Schema 1.1 Structures and Datatypes Working Drafts (December 2009) and an experimental implementation of the XML Schema Definition Language (XSD): Component Designators (SCD) Candidate Recommendation (January 2010) are provided for evaluation. For more information, refer to the XML Schema page. Xerces2 also provides a complete implementation of the Document Object Model Level 3 Core and Load/Save W3C Recommendations and provides a complete implementation of the XML Inclusions (XInclude) W3C Recommendation. It also provides support for OASIS XML Catalogs v1.1. Xerces2 is able to parse documents written according to the XML 1.1 Recommendation, except that it does not yet provide an option to enable normalization checking as described in section 2.13 of this specification. It also handles namespaces according to the XML Namespaces 1.1 Recommendation, and will correctly serialize XML 1.1 documents if the DOM level 3 load/save APIs are in use.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


// Aug 21, 2000:
//  Added ability to omit DOCTYPE declaration.
//  Reported by Lars Martin 
// Aug 25, 2000:
//  Added ability to omit comments.
//  Contributed by Anupam Bagchi 


package org.apache.xml.serialize;


import java.io.UnsupportedEncodingException;

import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;


/**
 * Specifies an output format to control the serializer. Based on the
 * XSLT specification for output format, plus additional parameters.
 * Used to select the suitable serializer and determine how the
 * document should be formatted on output.
 * 
 * The two interesting constructors are:
 * <ul>
 * <li>{@link #OutputFormat(String,String,boolean)} creates a format
 *  for the specified method (XML, HTML, Text, etc), encoding and indentation</li>
 * <li>{@link #OutputFormat(Document,String,boolean)} creates a format
 *  compatible with the document type (XML, HTML, Text, etc), encoding and
 *  indentation</li>
 * </ul>
 *
 *
 * @deprecated This class was deprecated in Xerces 2.9.0. It is recommended 
 * that new applications use the DOM Level 3 LSSerializer or JAXP's Transformation 
 * API for XML (TrAX) for serializing XML and HTML. See the Xerces documentation for 
 * more information.
 * @version $Revision: 476048 $ $Date: 2006-11-17 05:32:47 +0100 (Fri, 17 Nov 2006) $
 * @author Assaf Arkin
 *         Keith Visco
 * @see Serializer
 * @see Method
 * @see LineSeparator
 */
public class OutputFormat
{
    /**
     * Holder for the public and system identifiers of the strict HTML 4.01
     * and XHTML 1.0 document types.
     *
     * @deprecated This class was deprecated in Xerces 2.9.0. It is recommended
     * that new applications use the DOM Level 3 LSSerializer or JAXP's Transformation
     * API for XML (TrAX) for serializing XML and HTML. See the Xerces documentation for
     * more information.
     */
    public static class DTD {

        /** Public identifier of the HTML 4.01 (Strict) document type. */
        public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";

        /** System identifier of the HTML 4.01 (Strict) document type. */
        public static final String HTMLSystemId = "http://www.w3.org/TR/html4/strict.dtd";

        /** Public identifier of the XHTML 1.0 (Strict) document type. */
        public static final String XHTMLPublicId = "-//W3C//DTD XHTML 1.0 Strict//EN";

        /** System identifier of the XHTML 1.0 (Strict) document type. */
        public static final String XHTMLSystemId = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";

    }

    /**
     * Default values applied by {@link OutputFormat} when nothing else is
     * configured.
     *
     * @deprecated This class was deprecated in Xerces 2.9.0. It is recommended
     * that new applications use the DOM Level 3 LSSerializer or JAXP's Transformation
     * API for XML (TrAX) for serializing XML and HTML. See the Xerces documentation for
     * more information.
     */
    public static class Defaults {

        /**
         * Indentation level used when indentation is switched on
         * without an explicit level: 4.
         *
         * @see #setIndenting(boolean)
         */
        public static final int Indent = 4;

        /**
         * Default output encoding: UTF-8.
         *
         * @see #getEncoding()
         */
        public static final String Encoding = "UTF-8";

        /**
         * Default line width at which long lines are broken
         * when indenting: 72.
         */
        public static final int LineWidth = 72;

    }


    /**
     * The output method set through the constructors or
     * {@link #setMethod}, or null if no method was specified.
     */
    private String _method;


    /**
     * The version of the output method, or null if none was set.
     */
    private String _version;


    /**
     * The indentation level; zero means no indentation
     * was requested.
     */
    private int _indent = 0;


    /**
     * The name of the encoding to use. Initialized to
     * {@link Defaults#Encoding} (UTF-8); may be set to null through
     * {@link #setEncoding(String)}.
     */
    private String _encoding = Defaults.Encoding;

    /**
     * Cached EncodingInfo instance for _encoding. Resolved lazily by
     * {@link #getEncodingInfo} and cleared by {@link #setEncoding(String)}.
     */
    private EncodingInfo _encodingInfo = null;

    // Whether Java names for encodings are permitted; passed to
    // Encodings.getEncodingInfo when the EncodingInfo is resolved.
    private boolean _allowJavaNames = false;

    /**
     * The specified media type, or null.
     */
    private String _mediaType;


    /**
     * The specified document type system identifier, or null.
     */
    private String _doctypeSystem;


    /**
     * The specified document type public identifier, or null.
     */
    private String _doctypePublic;


    /**
     * True if the XML declaration should be omitted.
     */
    private boolean _omitXmlDeclaration = false;


    /**
     * True if the DOCTYPE declaration should be omitted.
     */
    private boolean _omitDoctype = false;


    /**
     * True if comments should be omitted.
     */
    private boolean _omitComments = false;


    /**
     * True if the document type should be marked as standalone.
     */
    private boolean _standalone = false;


    /**
     * List of element tag names whose text node children must
     * be output as CDATA, or null if none were specified.
     */
    private String[] _cdataElements;


    /**
     * List of element tag names whose text node children must
     * be output unescaped, or null if none were specified.
     */
    private String[] _nonEscapingElements;


    /**
     * The selected line separator; starts out as LineSeparator.Web.
     */
    private String _lineSeparator = LineSeparator.Web;


    /**
     * The line width at which to wrap long lines when indenting;
     * starts out as {@link Defaults#LineWidth}.
     */
    private int _lineWidth = Defaults.LineWidth;


    /**
     * True if spaces should be preserved in elements that do not
     * specify otherwise, or specify the default behavior.
     */
    private boolean _preserve = false;
    /**
     * If true, an attribute whose value is the empty string is output as "".
     * If false and the HTMLSerializer is in use, only the attribute name is
     * serialized. Defaults to false for backwards compatibility.
     */
	private boolean _preserveEmptyAttributes = false;

    /**
     * Creates an output format in which every setting keeps the value
     * given by its field initializer.
     */
    public OutputFormat() {
        // nothing to configure: the field initializers supply the defaults
    }


    /**
     * Creates an output format for the given method and encoding,
     * leaving all other settings at their defaults. When
     * <code>indenting</code> is true the default indentation level and
     * default line width are applied; when false both are set to zero.
     *
     * @param method The specified output method
     * @param encoding The specified encoding
     * @param indenting True for pretty printing
     * @see #setEncoding
     * @see #setIndenting
     * @see #setMethod
     */
    public OutputFormat( String method, String encoding, boolean indenting ) {
        this.setMethod( method );
        this.setEncoding( encoding );
        this.setIndenting( indenting );
    }


    /**
     * Constructs a new output format with the proper method,
     * document type identifiers and media type for the specified
     * document.
     *
     * @param doc The document to output
     * @see #whichMethod
     */
    public OutputFormat( Document doc )
    {
        setMethod( whichMethod( doc ) );
        setDoctype( whichDoctypePublic( doc ), whichDoctypeSystem( doc ) );
        setMediaType( whichMediaType( getMethod() ) );
    }


    /**
     * Creates an output format whose method, document type identifiers
     * and media type are derived from the given document, and then
     * applies the given encoding and indentation choice. When
     * <code>indenting</code> is true the default indentation level and
     * default line width are applied; when false both are set to zero.
     *
     * @param doc The document to output
     * @param encoding The specified encoding
     * @param indenting True for pretty printing
     * @see #setEncoding
     * @see #setIndenting
     * @see #whichMethod
     */
    public OutputFormat( Document doc, String encoding, boolean indenting ) {
        this( doc );
        this.setEncoding( encoding );
        this.setIndenting( indenting );
    }


    /**
     * Returns the output method configured for this format.
     * Typical values are xml, html or text, but any string may have
     * been set. When no method was set, null is returned; a suitable
     * method for a document can then be obtained from
     * {@link #whichMethod}.
     *
     * @return The configured output method, or null
     */
    public String getMethod() {
        return this._method;
    }


    /**
     * Stores the output method for this format. The value is not
     * validated.
     *
     * @see #getMethod
     * @param method The output method, or null
     */
    public void setMethod( String method ) {
        this._method = method;
    }


    /**
     * Returns the version configured for the output method.
     * Null means no version was set, in which case the default
     * version number is expected to be used. A serializer that does
     * not support the requested version should fall back to one it
     * supports.
     *
     * @return The configured method version, or null
     */
    public String getVersion() {
        return this._version;
    }


    /**
     * Stores the version of the output method, for example "1.0"
     * for XML or "4.0" for HTML. The value is not validated.
     *
     * @see #getVersion
     * @param version The output method version, or null
     */
    public void setVersion( String version ) {
        this._version = version;
    }


    /**
     * Returns the configured indentation level. Zero means no
     * indentation was requested and the document should not be
     * indented.
     *
     * @return The indentation level, or zero
     * @see #setIndenting
     */
    public int getIndent() {
        return this._indent;
    }


    /**
     * Tells whether indentation is switched on, i.e. whether the
     * indentation level is greater than zero.
     *
     * @return True if the indentation level is positive
     */
    public boolean getIndenting() {
        return 0 < this._indent;
    }


    /**
     * Sets the indentation level. A level of zero switches
     * indentation off; negative values are treated as zero.
     * A later call to {@link #setIndenting} overwrites this value
     * with zero (off) or the default (on).
     *
     * @param indent The indentation level, or zero
     */
    public void setIndent( int indent ) {
        // clamp negative levels to "no indentation"
        this._indent = ( indent < 0 ) ? 0 : indent;
    }


    /**
     * Switches indentation on or off. Switching it on applies the
     * default indentation level and the default line width
     * (see {@link Defaults#Indent} and {@link Defaults#LineWidth});
     * switching it off sets both the indentation level and the line
     * width to zero. Use {@link #setIndent} and {@link #setLineWidth}
     * afterwards to choose other values.
     *
     * @param on True if indentation should be on
     */
    public void setIndenting( boolean on ) {
        this._indent = on ? Defaults.Indent : 0;
        this._lineWidth = on ? Defaults.LineWidth : 0;
    }


    /**
     * Returns the configured encoding name. A freshly constructed
     * format reports "UTF-8"; null is returned if null was passed to
     * {@link #setEncoding(String)}.
     *
     * @return The encoding name
     */
    public String getEncoding() {
        return this._encoding;
    }


    /**
     * Sets the encoding name for this output format and discards any
     * cached {@link EncodingInfo}, so that {@link #getEncodingInfo}
     * resolves it again for the new name. Make sure the encoding is
     * compatible with the one used by the {@link java.io.Writer}.
     *
     * @see #getEncoding
     * @param encoding The encoding, or null
     */
    public void setEncoding( String encoding ) {
        this._encoding = encoding;
        // the cached info belonged to the previous encoding name
        this._encodingInfo = null;
    }

    /**
     * Sets the encoding from an EncodingInfo instance. The encoding
     * name becomes the instance's IANA name and the instance itself is
     * kept as the cached encoding info.
     *
     * @param encInfo The encoding info to use; must not be null
     */
    public void setEncoding( EncodingInfo encInfo ) {
        String ianaName = encInfo.getIANAName();
        this._encoding = ianaName;
        this._encodingInfo = encInfo;
    }

    /**
     * Returns the EncodingInfo instance for the current encoding.
     * The instance is looked up through
     * <code>Encodings.getEncodingInfo</code> on first use and cached
     * until the encoding is changed.
     *
     * @return The EncodingInfo for the current encoding
     * @throws UnsupportedEncodingException if the lookup reports the
     *         encoding as unsupported
     * @see #setEncoding
     */
    public EncodingInfo getEncodingInfo() throws UnsupportedEncodingException {
        if ( this._encodingInfo != null ) {
            return this._encodingInfo;
        }
        this._encodingInfo = Encodings.getEncodingInfo( this._encoding, this._allowJavaNames );
        return this._encodingInfo;
    }

    /**
     * Sets whether Java encoding names are permitted. The flag is
     * handed to the encoding lookup performed by
     * {@link #getEncodingInfo}; an EncodingInfo that is already cached
     * is not invalidated by this call.
     *
     * @param allow True to permit Java encoding names
     */
    public void setAllowJavaNames( boolean allow ) {
        this._allowJavaNames = allow;
    }

    /**
     * Returns whether Java encoding names are permitted.
     * <p>
     * Despite its setter-like name this no-argument overload is a getter.
     * The name is kept so that existing callers continue to compile and
     * link; new code should call {@link #getAllowJavaNames()} instead.
     *
     * @return True if Java encoding names are permitted
     * @see #getAllowJavaNames()
     */
    public boolean setAllowJavaNames () {
        return _allowJavaNames;
    }

    /**
     * Returns whether Java encoding names are permitted.
     * Correctly named equivalent of the no-argument
     * {@link #setAllowJavaNames()}.
     *
     * @return True if Java encoding names are permitted
     * @see #setAllowJavaNames(boolean)
     */
    public boolean getAllowJavaNames () {
        return _allowJavaNames;
    }

    /**
     * Returns the configured media type, or null if none was set.
     * To derive a media type from an output method instead, use
     * {@link #whichMediaType}.
     *
     * @return The configured media type, or null
     */
    public String getMediaType() {
        return this._mediaType;
    }


    /**
     * Stores the media type. The value is not validated.
     *
     * @see #getMediaType
     * @param mediaType The specified media type
     */
    public void setMediaType( String mediaType ) {
        this._mediaType = mediaType;
    }


    /**
     * Stores the document type public and system identifiers.
     * As originally documented, these are needed only when the DOM
     * Document or SAX events do not specify the document type and one
     * must be present in the serialized document; a document type
     * specified by the DOM Document or SAX events overrides these
     * values.
     *
     * @param publicId The public identifier, or null
     * @param systemId The system identifier, or null
     */
    public void setDoctype( String publicId, String systemId ) {
        this._doctypePublic = publicId;
        this._doctypeSystem = systemId;
    }


    /**
     * Returns the configured document type public identifier.
     *
     * @return The public identifier, or null if none was set
     */
    public String getDoctypePublic() {
        return this._doctypePublic;
    }


    /**
     * Returns the configured document type system identifier.
     *
     * @return The system identifier, or null if none was set
     */
    public String getDoctypeSystem() {
        return this._doctypeSystem;
    }


    /**
     * Tells whether comments should be omitted from the output.
     * The default is false.
     *
     * @return True if comments should be omitted
     */
    public boolean getOmitComments() {
        return this._omitComments;
    }


    /**
     * Switches the omission of comments on or off.
     *
     * @param omit True if comments should be omitted
     */
    public void setOmitComments( boolean omit ) {
        this._omitComments = omit;
    }


    /**
     * Tells whether the DOCTYPE declaration should be omitted from
     * the output. The default is false.
     *
     * @return True if the DOCTYPE declaration should be omitted
     */
    public boolean getOmitDocumentType() {
        return this._omitDoctype;
    }


    /**
     * Switches the omission of the DOCTYPE declaration on or off.
     *
     * @param omit True if the DOCTYPE declaration should be omitted
     */
    public void setOmitDocumentType( boolean omit ) {
        this._omitDoctype = omit;
    }


    /**
     * Tells whether the XML declaration should be omitted from the
     * output. The default is false.
     *
     * @return True if the XML declaration should be omitted
     */
    public boolean getOmitXMLDeclaration() {
        return this._omitXmlDeclaration;
    }


    /**
     * Switches the omission of the XML declaration on or off.
     *
     * @param omit True if the XML declaration should be omitted
     */
    public void setOmitXMLDeclaration( boolean omit ) {
        this._omitXmlDeclaration = omit;
    }


    /**
     * Tells whether the document type is marked as standalone.
     * The default is false.
     *
     * @return True if the document is marked as standalone
     */
    public boolean getStandalone() {
        return this._standalone;
    }


    /**
     * Marks the document DTD as standalone or not. This setter only
     * records the flag; as originally documented, the public and
     * system identifiers must be null for the document to be
     * serialized as standalone.
     *
     * @param standalone True if document DTD is standalone
     */
    public void setStandalone( boolean standalone ) {
        this._standalone = standalone;
    }


    /**
     * Returns the tag names of all elements whose text node children
     * should be output as CDATA. The array handed to
     * {@link #setCDataElements} is returned as is (no copy is made).
     *
     * @return The CDATA element tag names, or null if none were specified
     */
    public String[] getCDataElements() {
        return this._cdataElements;
    }


    /**
     * Returns true if the text node children of the given element
     * should be output as CDATA, i.e. if the tag name equals
     * (case-sensitively) one of the configured CDATA element names.
     * Null entries in the configured list are ignored rather than
     * causing a NullPointerException.
     *
     * @param tagName The element's tag name
     * @return True if should serialize as CDATA; false if no list was
     *   configured, the tag name is null, or no entry matches
     */
    public boolean isCDataElement( String tagName )
    {
        int i;

        // Nothing configured, or nothing to look up: never a match.
        // (A null tag name could not match any entry before either.)
        if ( _cdataElements == null || tagName == null )
            return false;
        for ( i = 0 ; i < _cdataElements.length ; ++i )
            // Compare from the known non-null side so that a null
            // entry in the array simply does not match.
            if ( tagName.equals( _cdataElements[ i ] ) )
                return true;
        return false;
    }


    /**
     * Stores the tag names of the elements whose text node children
     * should be output as CDATA. The array is kept by reference (no
     * copy is made).
     *
     * @param cdataElements List of CDATA element tag names
     */
    public void setCDataElements( String[] cdataElements ) {
        this._cdataElements = cdataElements;
    }


    /**
     * Returns the tag names of all elements whose text node children
     * should be output unescaped (no character references). The array
     * handed to {@link #setNonEscapingElements} is returned as is (no
     * copy is made).
     *
     * @return The unescaped element tag names, or null if none were
     *   specified
     */
    public String[] getNonEscapingElements() {
        return this._nonEscapingElements;
    }


    /**
     * Returns true if the text node children of the given element
     * should be output unescaped, i.e. if the tag name equals
     * (case-sensitively) one of the configured non-escaping element
     * names. Null entries in the configured list are ignored rather
     * than causing a NullPointerException.
     *
     * @param tagName The element's tag name
     * @return True if should serialize unescaped; false if no list was
     *   configured, the tag name is null, or no entry matches
     */
    public boolean isNonEscapingElement( String tagName )
    {
        int i;

        // Nothing configured, or nothing to look up: never a match.
        // (A null tag name could not match any entry before either.)
        if ( _nonEscapingElements == null || tagName == null ) {
            return false;
        }
        for ( i = 0 ; i < _nonEscapingElements.length ; ++i )
            // Compare from the known non-null side so that a null
            // entry in the array simply does not match.
            if ( tagName.equals( _nonEscapingElements[ i ] ) )
                return true;
        return false;
    }


    /**
     * Stores the tag names of the elements whose text node children
     * should be output unescaped (no character references). The array
     * is kept by reference (no copy is made).
     *
     * @param nonEscapingElements List of unescaped element tag names
     */
    public void setNonEscapingElements( String[] nonEscapingElements ) {
        this._nonEscapingElements = nonEscapingElements;
    }



    /**
     * Returns the line separator to use. Unless changed, this is the
     * Web line separator (<code>LineSeparator.Web</code>). The
     * separator is a string so that two-character sequences (CR + LF)
     * can be expressed.
     *
     * @return The configured line separator
     */
    public String getLineSeparator() {
        return this._lineSeparator;
    }


    /**
     * Sets the line separator. Passing null selects the Web line
     * separator (<code>LineSeparator.Web</code>), which is also the
     * initial value. The machine's line separator can be obtained
     * from the system property <code>line.separator</code>, but is
     * only useful if the document is edited on machines of the same
     * type. For general documents, use the Web line separator.
     *
     * @param lineSeparator The specified line separator, or null for
     *   the Web line separator
     */
    public void setLineSeparator( String lineSeparator ) {
        this._lineSeparator = ( lineSeparator != null ) ? lineSeparator : LineSeparator.Web;
    }


    /**
     * Tells whether preserving spaces is the default behavior for this
     * format. As originally documented, elements that do not specify
     * otherwise, or that specify the default behavior, are formatted
     * according to this rule, while elements that explicitly request
     * space preserving always preserve space.
     *
     * @return True if spaces are preserved by default
     */
    public boolean getPreserveSpace() {
        return this._preserve;
    }


    /**
     * Chooses whether space preserving is the default behavior. The
     * initial setting is false (space stripping), so elements that do
     * not specify otherwise or use the default value do not preserve
     * spaces.
     *
     * @param preserve True if spaces should be preserved
     */
    public void setPreserveSpace( boolean preserve ) {
        this._preserve = preserve;
    }


    /**
     * Returns the line width used for breaking up long lines.
     * As originally documented, long lines are broken at space
     * boundaries based on this width when indenting, and only when
     * indenting. A value of zero means no line wrapping.
     *
     * @return The line width, or zero for no wrapping
     */
    public int getLineWidth() {
        return this._lineWidth;
    }


    /**
     * Sets the line width. Zero disables line wrapping; negative
     * values are treated as zero. A later call to
     * {@link #setIndenting} overwrites this value with zero (off) or
     * the default (on).
     *
     * @param lineWidth The line width to use, zero for no wrapping
     * @see #getLineWidth
     * @see #setIndenting
     */
    public void setLineWidth( int lineWidth ) {
        // anything that is not a positive width means "no wrapping"
        this._lineWidth = ( lineWidth > 0 ) ? lineWidth : 0;
    }
	/**
	 * Returns the preserveEmptyAttributes flag. If the flag is false,
	 * attributes with empty string values are output as the attribute
	 * name only (in HTML mode).
	 *
	 * @return The preserve flag
	 */
	public boolean getPreserveEmptyAttributes () {
		return this._preserveEmptyAttributes;
	}

	/**
	 * Sets the preserveEmptyAttributes flag. If the flag is false,
	 * attributes with empty string values are output as the attribute
	 * name only (in HTML mode).
	 *
	 * @param preserve The preserve flag
	 */
	public void setPreserveEmptyAttributes (boolean preserve) {
		this._preserveEmptyAttributes = preserve;
	}

    /**
     * Returns the last printable character based on the selected
     * encoding. Control characters and non-printable characters
     * are always printed as character references.
     * <p>
     * For the 7-bit ASCII encoding (named "ASCII" or "US-ASCII",
     * compared case-insensitively) the limit is 0x7F, since code points
     * above that cannot be represented in ASCII. For every other
     * encoding, including a null encoding, the limit is 0xFFFF.
     *
     * @return The highest character that may be written without a
     *   character reference
     */
    public char getLastPrintable()
    {
        // Read the encoding once so that both name checks see the same value.
        String encoding = getEncoding();

        if ( encoding != null &&
             ( encoding.equalsIgnoreCase( "ASCII" ) ||
               encoding.equalsIgnoreCase( "US-ASCII" ) ) ) {
            // ASCII defines only 0x00-0x7F; 0x80-0xFF are not encodable.
            return 0x7F;
        }
        return 0xFFFF;
    }


    /**
     * Determines the output method for the specified document.
     * <ul>
     * <li>If the document is an instance of
     *  {@link org.w3c.dom.html.HTMLDocument}, the method is html.</li>
     * <li>Otherwise the document's children are scanned in order. A text
     *  node containing anything other than space, tab, line feed or
     *  carriage return makes the method xml. The first element node
     *  decides: a node name of "html" (ignoring case) gives html, a node
     *  name of "root" (ignoring case) gives fop, anything else gives
     *  xml. Other node types are skipped.</li>
     * <li>If no element node is found, the method is xml.</li>
     * </ul>
     *
     * @param doc The document to check
     * @return The suitable method
     */
    public static String whichMethod( Document doc )
    {
        // A document built on the HTML DOM is always serialized as html.
        if ( doc instanceof HTMLDocument ) {
            return Method.HTML;
        }

        // FIXME (SM) should we care about namespaces here?

        // Walk the top-level children until the root element is reached.
        for ( Node child = doc.getFirstChild() ; child != null ; child = child.getNextSibling() ) {
            short type = child.getNodeType();

            if ( type == Node.ELEMENT_NODE ) {
                // The root element's name settles the method.
                String rootName = child.getNodeName();
                if ( rootName.equalsIgnoreCase( "html" ) ) {
                    return Method.HTML;
                }
                if ( rootName.equalsIgnoreCase( "root" ) ) {
                    return Method.FOP;
                }
                return Method.XML;
            }

            if ( type == Node.TEXT_NODE ) {
                // Whitespace-only text before the root leaves the question
                // open; any other character means this is definitely xml.
                String text = child.getNodeValue();
                for ( int pos = 0 ; pos < text.length() ; ++pos ) {
                    char ch = text.charAt( pos );
                    boolean whitespace =
                        ch == 0x20 || ch == 0x0A || ch == 0x09 || ch == 0x0D;
                    if ( ! whitespace ) {
                        return Method.XML;
                    }
                }
            }
        }

        // No root element found: fall back to xml.
        return Method.XML;
    }


    /**
     * Returns the document type public identifier specified for this
     * document, or null.
     * <p>
     * If the document has a document type node, its public identifier is
     * returned as is (possibly null). If the document has no document
     * type node, or the DOM implementation does not provide
     * <code>getPublicId()</code>, the XHTML 1.0 Strict public identifier
     * is returned for an {@link org.w3c.dom.html.HTMLDocument} and null
     * for any other document.
     *
     * @param doc The document to inspect
     * @return The public identifier, or null
     */
    public static String whichDoctypePublic( Document doc )
    {
        DocumentType doctype;

        /* DOM Level 2 was introduced into the code base */
        doctype = doc.getDoctype();
        if ( doctype != null ) {
            // DOM Level 1 does not specify getPublicId(); calling it on
            // such an implementation fails with a linkage error such as
            // NoSuchMethodError or AbstractMethodError. Only that family
            // of errors is treated as "not available" - unrelated errors
            // (e.g. OutOfMemoryError) must not be silently discarded.
            try {
                return doctype.getPublicId();
            } catch ( IncompatibleClassChangeError except ) {
                // fall through to the HTMLDocument default below
            }
        }

        if ( doc instanceof HTMLDocument )
            return DTD.XHTMLPublicId;
        return null;
    }


    /**
     * Returns the document type system identifier specified for this
     * document, or null.
     * <p>
     * If the document has a document type node, its system identifier is
     * returned as is (possibly null). If the document has no document
     * type node, or the DOM implementation does not provide
     * <code>getSystemId()</code>, the XHTML 1.0 Strict system identifier
     * is returned for an {@link org.w3c.dom.html.HTMLDocument} and null
     * for any other document.
     *
     * @param doc The document to inspect
     * @return The system identifier, or null
     */
    public static String whichDoctypeSystem( Document doc )
    {
        DocumentType doctype;

        /* DOM Level 2 was introduced into the code base */
        doctype = doc.getDoctype();
        if ( doctype != null ) {
            // DOM Level 1 does not specify getSystemId(); calling it on
            // such an implementation fails with a linkage error such as
            // NoSuchMethodError or AbstractMethodError. Only that family
            // of errors is treated as "not available" - unrelated errors
            // (e.g. OutOfMemoryError) must not be silently discarded.
            try {
                return doctype.getSystemId();
            } catch ( IncompatibleClassChangeError except ) {
                // fall through to the HTMLDocument default below
            }
        }

        if ( doc instanceof HTMLDocument )
            return DTD.XHTMLSystemId;
        return null;
    }


    /**
     * Returns the suitable media type for a document output with the
     * specified method. The method name is compared case-insensitively:
     * xml maps to "text/xml", html and xhtml map to "text/html", text
     * maps to "text/plain" and fop maps to "application/pdf".
     *
     * @param method The output method, or null if none was specified
     * @return The media type, or null if the method is null or not one
     *   of the known methods
     */
    public static String whichMediaType( String method )
    {
        // getMethod() may legitimately return null (no method specified);
        // treat that like any other unrecognized method.
        if ( method == null )
            return null;
        if ( method.equalsIgnoreCase( Method.XML ) )
            return "text/xml";
        if ( method.equalsIgnoreCase( Method.HTML ) )
            return "text/html";
        if ( method.equalsIgnoreCase( Method.XHTML ) )
            return "text/html";
        if ( method.equalsIgnoreCase( Method.TEXT ) )
            return "text/plain";
        if ( method.equalsIgnoreCase( Method.FOP ) )
            return "application/pdf";
        return null;
    }


}