All downloads are free. Search and download functionality uses the official Maven repository.

edu.harvard.hul.ois.jhove.module.XmlModule Maven / Gradle / Ivy

/**********************************************************************
 * Jhove - JSTOR/Harvard Object Validation Environment
 * Copyright 2004-2007 by JSTOR and the President and Fellows of Harvard College

 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 **********************************************************************/

package edu.harvard.hul.ois.jhove.module;

import java.io.*;
import java.util.*;
import edu.harvard.hul.ois.jhove.*;
import edu.harvard.hul.ois.jhove.module.xml.*;
import edu.harvard.hul.ois.jhove.module.html.HtmlMetadata;
import edu.harvard.hul.ois.jhove.module.html.DTDMapper;

import org.xml.sax.XMLReader;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.*;

/**
 *  Module for identification and validation of XML files.
 *  @author Gary McGath
 */
public class XmlModule
    extends ModuleBase
{
    /******************************************************************
     * PRIVATE CLASS FIELDS.
     * Descriptive constants for this module.  The constructor passes
     * every one of them to the ModuleBase constructor.
     ******************************************************************/

    /* Module name. */
    private static final String NAME = "XML-hul";
    /* Module release identifier. */
    private static final String RELEASE = "1.4";
    /* Release date; presumably {year, month, day} -- confirm against
     * ModuleBase. */
    private static final int [] DATE = {2007, 1, 8};
    /* Format names handled by this module.  parse() reports _format[0]
     * by default and switches to _format[1] when the root element is
     * "html" (assuming ModuleBase stores this array as _format --
     * TODO confirm). */
    private static final String [] FORMAT = {
        "XML", "XHTML"
    };
    /* Specification coverage statement. */
    private static final String COVERAGE = "XML 1.0";
    /* According to RFC 3023, text/xml should be used for human-readable
     * XML documents, and application/xml should be used for documents
     * that aren't easily read by humans.  That determination
     * is beyond the scope of this project.
     * NOTE(review): earlier wording of this comment named application/xml
     * as the primary MIME type, but the array lists text/xml first and
     * parse() reports _mimeType[0] (presumably this array's first
     * element) -- confirm which one is intended.
     * MIMETYPE[2] is only for XHTML. */
    private static final String [] MIMETYPE = {
	"text/xml", "application/xml", "text/html"
    };
    /* Human-readable statement of the well-formedness criterion. */
    private static final String WELLFORMED = "An XML file is well-formed if " +
	"it meets the criteria defined in Section 2.1 of the XML " +
	"specification (W3C Recommendation, 3rd edition, 2004-02-04)";
    /* Human-readable statement of the validity criterion. */
    private static final String VALIDITY = "An XML file is valid if " +
	"well-formed, and the file has an associated DTD or XML Schema and " +
	"the file meets the constraints defined by that DTD or Schema";
    /* Summary of the representation information this module reports.
     * NOTE(review): the text contains the misspelling "endcoding"; it is
     * left untouched here because it is emitted at runtime. */
    private static final String REPINFO = "Additional representation " +
	"information includes: version, endcoding, standalone flag, DTD or " +
	"schema, namespaces, notations, character references, entities, " +
	"processing instructions, and comments";
    /* General note about how the module performs its checks. */
    private static final String NOTE = "This module determines " +
	"well-formedness and validity using the SAX2-conforming parser " +
	"specified by the invoking application";
    /* Copyright and license statement. */
    private static final String RIGHTS = "Copyright 2004-2007 by JSTOR and " +
	"the President and Fellows of Harvard College. " +
	"Released under the GNU Lesser General Public License.";

    /******************************************************************
     * INSTANCE FIELDS (all declared protected).
     ******************************************************************/

    /* Checksummer object.  parse() creates one only when checksumming
     * is requested and the RepInfo has no checksums yet; otherwise it
     * is left null. */
    protected Checksummer _ckSummer;
    
    /* Input stream wrapper which handles checksums.  parse() wraps its
     * input stream in this, together with _ckSummer. */
    protected ChecksumInputStream _cstream;
    
    /* Data input stream wrapped around _cstream.
     * NOTE(review): not assigned anywhere in the code visible here --
     * confirm whether it is still used. */
    protected DataInputStream _dstream;

    /* Top-level property list; the value of the _metadata property. */
    protected List _propList;
    
    /* Top-level property ("XMLMetadata"), rebuilt on every parse() call. */
    protected Property _metadata;
    
    /* Doctype for XHTML documents only, otherwise null.  Set through
     * setXhtmlDoctype and cleared by parse() on the first pass. */
    protected String _xhtmlDoctype;
    
    /* Base URL for DTD's.  If null, all DTD URL's are absolute.
     * parse() treats a non-null value as a file path when it builds
     * the system ID of the parser's InputSource. */
    protected String _baseURL;
    
    /* Flag to control signature checking behavior. If true,
     * checkSignatures insists on an XML document declaration; if
     * false, it will parse the file if there is no document
     * declaration.
     */
    protected boolean _sigWantsDecl;
    
    /* Flag to indicate we're invoking the parser from checkSignatures.
     * When true, it's up to checkSignatures to mark a signature as present.
     */
    protected boolean _parseFromSig;

    /* Flag to know if the property TextMDMetadata is to be added.
     * parse() sets it when a default parameter equals "withtextmd=true"
     * (compared without regard to case). */
    protected boolean _withTextMD = false;
    /* Hold the information needed to generate a textMD metadata fragment */
    protected TextMDMetadata _textMD;
    
    /* Map from URIs to locally stored schemas.  Created empty by the
     * constructor and handed to the parse handler in parse(). */
    protected Map _localSchemas;
    
    /******************************************************************
    * CLASS CONSTRUCTOR.
    ******************************************************************/
   /**
    *  Creates an XmlModule, registering its descriptive constants with
    *  the superclass and recording the vendor, the two specification
    *  documents (the XML 1.0 Recommendation and SAX), the optional
    *  ".xml" extension signature, and an empty local-schema map.
    */
   public XmlModule ()
   {
       super (NAME, RELEASE, DATE, FORMAT, COVERAGE, MIMETYPE, WELLFORMED,
              VALIDITY, REPINFO, NOTE, RIGHTS, false);

       _vendor = Agent.harvardInstance();

       // No locally stored schemas until param() supplies some.
       _localSchemas = new HashMap ();

       // Specification 1: the XML 1.0 Recommendation, published by the W3C.
       Document xmlSpec = new Document (
               "Extensible Markup Language (XML) 1.0 (Third Edition)",
               DocumentType.REPORT);
       xmlSpec.setPublisher (Agent.newW3CInstance());
       xmlSpec.setDate ("2004-02-04");
       Identifier xmlSpecUrl = new Identifier ("http://www.w3.org/TR/REC-xml",
                                               IdentifierType.URL);
       xmlSpec.setIdentifier (xmlSpecUrl);
       _specification.add (xmlSpec);

       // Specification 2: the SAX project web site.
       Document saxSpec = new Document ("SAX", DocumentType.WEB);
       Identifier saxUrl = new Identifier ("http://sax.sourceforge.net/",
                                           IdentifierType.URL);
       saxSpec.setIdentifier (saxUrl);
       _specification.add (saxSpec);

       // External signature: the ".xml" file extension, flagged optional.
       _signature.add (new ExternalSignature (".xml",
                                              SignatureType.EXTENSION,
                                              SignatureUseType.OPTIONAL));
   }

   /**
    *  Records the doctype string, which the caller is assumed to have
    *  forced to upper case.  Only the HTML module calls this, when it
    *  invokes the XML module for an XHTML document.  If textMD metadata
    *  has already been created, its markup language is set to the same
    *  value.
    *
    *  @param doctype  the XHTML doctype string
    */
   public void setXhtmlDoctype (String doctype)
   {
       _xhtmlDoctype = doctype;
       if (_textMD == null) {
           // No textMD metadata object yet, so there is nothing to update.
           return;
       }
       _textMD.setMarkup_language (doctype);
   }

    /**
     * Restores the parameter-controlled settings to their defaults:
     * both signature-related flags are cleared and the base URL is
     * removed.
     *
     * @throws Exception  declared by the overridden method; nothing in
     *                    this implementation throws
     */
    @Override
    public void resetParams ()
        throws Exception
    {
        // Signature-handling flags go back to false ...
        _parseFromSig = false;
        _sigWantsDecl = false;
        // ... and no base URL means DTD URLs are taken as absolute.
        _baseURL = null;
    }

    /**
     * Per-action initialization.
     *
     * <p>The parameter is interpreted as follows.  Prefixes are matched
     * without regard to case.</p>
     * <ol>
     *   <li>If it starts with "schema=", it is handed (in lower-cased
     *       form, as before) to <code>addLocalSchema</code>, and nothing
     *       else is examined.  The part to the right of the equal sign
     *       identifies a URI with a local path (URI, then semicolon,
     *       then path).</li>
     *   <li>Otherwise, if the first character is 's', signature checking
     *       requires a document declaration; that character is removed
     *       and the rest of the parameter is considered in the next
     *       step.</li>
     *   <li>If the (remaining) parameter begins with 'b', the text after
     *       it is used as the base URL, with its case preserved because
     *       it is later used as a file path.  Otherwise there is no
     *       change to the base URL.</li>
     * </ol>
     *
     * <p>This method only ever turns settings on; it does not clear a
     * flag or base URL set by an earlier call.</p>
     *
     * @param	param The module parameter; under command-line Jhove, the
     *        -p parameter.  A null value is ignored.
     */
    @Override
    public void param (String param)
    {
        if (param == null) {
            return;
        }

        // The schema form is recognized on the lower-cased text, and the
        // lower-cased text is what addLocalSchema has always received.
        String lowered = param.toLowerCase ();
        if (lowered.startsWith ("schema=")) {
            addLocalSchema (lowered);
            return;
        }

        // Work on the original text from here on, so that the base URL
        // keeps its case (file paths may be case-sensitive).
        String rest = param;
        if (rest.startsWith ("s") || rest.startsWith ("S")) {
            _sigWantsDecl = true;
            rest = rest.substring (1);
        }

        // Deliberately a separate test, not an else-branch: an 's' prefix
        // may be followed by a base URL specification.
        if (rest.startsWith ("b") || rest.startsWith ("B")) {
            _baseURL = rest.substring (1);
        }
    }


   /**
    *   Parse the content of a purported XML digital object and store the
    *   results in RepInfo.
    * 
    *   This is designed to be called in two passes.  On the first pass,
    *   a nonvalidating parse is done.  If this succeeds, and the presence
    *   of DTD's or schemas is detected, then parse returns 1 so that it
    *   will be called again to do a validating parse.  If there is nothing
    *   to validate, we consider it "valid."
    *
     *   @param stream    An InputStream, positioned at its beginning,
     *                    which is generated from the object to be parsed.
     *                    If multiple calls to parse are made 
     *                    on the basis of a nonzero value being returned,
     *                    a new InputStream must be provided each time.
     * 
     *   @param info      A fresh (on the first call) RepInfo object 
     *                    which will be modified
     *                    to reflect the results of the parsing
     *                    If multiple calls to parse are made 
     *                    on the basis of a nonzero value being returned, 
     *                    the same RepInfo object should be passed with each
     *                    call.
     *
     *   @param parseIndex  Must be 0 in first call to parse.  If
     *                    parse returns a nonzero value, it must be
     *                    called again with parseIndex 
     *                    equal to that return value.
    */
    @Override
    public int parse (InputStream stream, RepInfo info, int parseIndex)
       throws IOException
    {
        // Test if textMD is to be generated

        if (_defaultParams != null) {
            _withTextMD = false;
            Iterator iter = _defaultParams.iterator ();
            while (iter.hasNext ()) {
                String param = iter.next ();
                if ("withtextmd=true".equalsIgnoreCase(param)) {
                    _withTextMD = true;
                }
            }
        }
        
        boolean canValidate = true;
        initParse ();
        info.setFormat (_format[0]);
        info.setMimeType (_mimeType[0]);
        info.setModule (this);
        if (_textMD == null || parseIndex == 0) {
            _textMD = new TextMDMetadata();
            _xhtmlDoctype = null;
        }
        
        /* We may have already done the checksums while converting a
           temporary file. */
        _ckSummer = null;
        if (_je != null && _je.getChecksumFlag () &&
            info.getChecksum ().isEmpty()) {
            _ckSummer = new Checksummer ();
        }
        _cstream = new ChecksumInputStream (stream, _ckSummer);

        _propList = new LinkedList ();
        _metadata = new Property ("XMLMetadata",
                PropertyType.PROPERTY,
                PropertyArity.LIST,
                _propList);

        XMLReader parser = null;
        InputSource src = null;
        XmlModuleHandler handler = null;
        XmlLexicalHandler lexHandler = new XmlLexicalHandler ();
        XmlDeclHandler declHandler = new XmlDeclHandler ();
        
        // The XmlDeclStream filters the characters, looking for an
        // XML declaration,  since there's no way to get that info
        // out of SAX.
        XmlDeclStream xds = new XmlDeclStream (_cstream);
        try {
            // Create an InputSource to feed the parser.
            // If a SAX class was specified, use it, otherwise use
            // the default parser.
            src = new InputSource (xds);
            // setSystemId may be helpful in resolving relative URI's,
            // though its use is unclear.  Its actual content is merely
            // informative, not a part of any actual link
            //src.setSystemId ("http://hul.harvard.edu/hul");
            if (_baseURL != null) {
                src.setSystemId(new File(_baseURL).toURI().toURL().toString());
            }
            String saxClass = _je.getSaxClass();
            if (saxClass == null) {
                SAXParserFactory factory = 
                                SAXParserFactory.newInstance();
                factory.setNamespaceAware (true);
                parser = factory.newSAXParser ().getXMLReader ();
            }
            else {
                parser = XMLReaderFactory.createXMLReader (saxClass);
            }
            handler = new XmlModuleHandler ();
            handler.setXhtmlFlag (_xhtmlDoctype != null);
            handler.setLocalSchemas (_localSchemas);
            parser.setContentHandler (handler);
            parser.setErrorHandler (handler);
            parser.setEntityResolver (handler);
            parser.setDTDHandler (handler);
            try {
                parser.setProperty 
                    ("http://xml.org/sax/properties/lexical-handler",
                     lexHandler);
            }
            catch (SAXException e) {
                info.setMessage (new InfoMessage 
                        ("The XML implementation in use does not " +
                        "support the LexicalHandler interface. " +
                        "This may result in some properties not being reported."));
            }
            try {
                parser.setProperty 
                    ("http://xml.org/sax/properties/declaration-handler",
                     declHandler);
            }
            catch (SAXException e) {
                info.setMessage (new InfoMessage 
                        ("The XML implementation in use does not " +
                        "support the DeclHandler interface. " +
                        "This may result in some properties not being reported."));
            }
            
        }
        catch (Exception f) {
            info.setMessage(new ErrorMessage (f.getMessage()));
            info.setWellFormed (false);  // actually not the file's fault
            return 0;
        }
        try {
            // On the first pass, we parse without validation.
            parser.setFeature ("http://xml.org/sax/features/validation",
                parseIndex == 0 ? false : true);
        }
        catch (SAXException se) {
            if (parseIndex != 0) {
                info.setMessage (new InfoMessage 
                    ("The SAX parser is not capable of validation."));
            }
            canValidate = false;
        }
        try {
            parser.setFeature ("http://xml.org/sax/features/namespaces",
                true);
        }
        catch (SAXException se) {
            info.setMessage (new InfoMessage 
                ("The SAX parser does not support namespaces."));
        }
        // This property for supporting schemas is a JAXP 1.2
        // recommendation, not likely to be supported widely as
        // of this (February 2004) writing, and not supported in
        // standard Crimson.  But it looks like the way to prepare
        // for schema validation in the future, and at least the
        // info message will tell users why they're getting bogus
        // invalid status.
        
        // Try 2 different ways of setting schema validation;
        // it appears that no one way works for all parsers.
        if (parseIndex > 0) {
            try {
                parser.setFeature("http://apache.org/xml/features/validation/schema",
                        true);
            }
            catch (SAXException ee) {
                try {
                    parser.setProperty 
                        ("http://java.sun.com/xml/jaxp/properties/schemaLanguage",
                         "http://www.w3.org/2001/XMLSchema");
                }
                catch (SAXException e) {
                    info.setMessage (new InfoMessage 
                            ("The XML implementation in use does not " +
                            "support schema language identification.  This " +
                            "may result in documents specified by schemas " +
                            "being reported as invalid."));
                }
            }
        }
        try {
            parser.parse (src);
        }
        catch (FileNotFoundException ef) {
            // Make this particular exception a little more user-friendly
            info.setMessage (new ErrorMessage
                    ("File not found",
                     ef.getMessage ().toString ()));
            info.setWellFormed (false);
            return 0;
        }
        catch (UTFDataFormatException u) {
            if (handler.getSigFlag () && !_parseFromSig) {
                info.setSigMatch(_name);
            }
            info.setMessage (new ErrorMessage ("Invalid character encoding"));
            info.setWellFormed (false);
            return 0;
        }
        catch (IOException e) {
            // We may get an IOException from trying to resolve an
            // external entity.
            if (handler.getSigFlag () && !_parseFromSig) {               
                info.setSigMatch(_name);
            }
            info.setMessage (new ErrorMessage
                    (e.getClass().getName() + ": " + 
                     e.getMessage ().toString ()));
            info.setWellFormed (false);
            return 0;
        }
        catch (SAXParseException e) {
            // Document failed to parse.
            if (handler.getSigFlag () && !_parseFromSig) {
                info.setSigMatch(_name);
            }
            int line = e.getLineNumber();
            int col = e.getColumnNumber();
            info.setMessage (new ErrorMessage 
                    (e.getMessage ().toString (),
                     "Line = " + line + ", Column = " + col));
            info.setWellFormed (false);
            return 0;
        }
        catch (SAXException e) {
            // Other SAX error.
            if (handler.getSigFlag ()) {
                info.setSigMatch(_name);
            }
            // Sometimes the message will be null and another message
            // wrapped inside it.  Try to report that.
            String msg = e.getMessage ();
            if (msg == null) {
                Throwable ee = e.getCause();
                if (ee != null) {
                    msg = "SAXException, cause = " +
                        ee.getClass().getName();
                }
                else {
                    msg = "Unspecified SAXException";
                }
            }
            info.setMessage (new ErrorMessage (msg));
            info.setWellFormed (false);
            return 0;
        }
        
        // Check if user has aborted
        if (_je.getAbort ()) {
            return 0;
        }
        
        if (handler.getSigFlag () && parseIndex == 0) {
            info.setSigMatch(_name);
        }
        // If it's the first pass, check if we found a DTD
        // or schema.
        // If so, reparse with validation enabled.
        // (Validation with schemas may prove futile, as the
        // Crimson parser understands only DTD and DOCTYPE
        // declarations as contributing to validity.)
        String dtdURI = handler.getDTDURI ();
        List schemaList = handler.getSchemas ();
        
        // In order to find the "primary" markup language, we try 3 things:
        // 1/ first, the first NamespaceURI
        // 2/ then, the first SchemaLocation
        // 3/ finally, the DTD URI
        // It should be noted that later on, when we look at the namespaces in relation to the root element,
        // a URI bound to the root element's prefix, if there is one, takes precedence ...
        if (!schemaList.isEmpty()) {
            SchemaInfo schItems = schemaList.get(0);
            // First NamespaceURI
            if (isNotEmpty(schItems.namespaceURI)) {
                _textMD.setMarkup_language(schItems.namespaceURI);
            // Then SchemaLocation
            } 
            else if (isNotEmpty(schItems.location)) {
                _textMD.setMarkup_language(schItems.location);
            }
        } 
        else if (isNotEmpty(dtdURI)) {
            _textMD.setMarkup_language(dtdURI);
        }
        
        if (parseIndex == 0) {
            if ((handler.getDTDURI () != null ||
                 !schemaList.isEmpty ()) &&
                canValidate) {
                return 1;
            }
            info.setValid (RepInfo.UNDETERMINED);
            // This may get downgraded to false, but won't
            // be upgraded to true.
        }
        
        // Take a deep breath.  We parsed it.  Now assemble the
        // properties.
        info.setProperty (_metadata);
        
        // If it's XHTML, add the HTML property.
        HtmlMetadata hMetadata = handler.getHtmlMetadata ();
        if (hMetadata != null) {
            info.setProperty (hMetadata.toProperty (_withTextMD?_textMD:null));
        }

        // Report the parser in a property.
        _propList.add (new Property ("Parser",
                        PropertyType.STRING,
                        parser.getClass().getName()));

        // Add the version property.  Give precedence to XHTML doctype.
        String vers = null;
        if (_xhtmlDoctype != null) {
            vers = DTDMapper.getXHTMLVersion(_xhtmlDoctype);
            _textMD.setMarkup_language_version(vers);
        }
        if (vers != null) {
            info.setVersion (vers);
        }
        else {
            vers = xds.getVersion ();
            if (vers != null) {
                info.setVersion (vers);
            }
        }
        _textMD.setMarkup_basis_version(vers);
        
        // Add the encoding property.
        String encoding = xds.getEncoding ();
        if (encoding == null) {
            // If no explicit encoding, use default (Bugzilla 136)
            encoding = "UTF-8";
        }
        _propList.add (new Property ("Encoding",
                        PropertyType.STRING,
                        encoding));
        
        _textMD.setCharset(encoding);
        String textMDEncoding = _textMD.getCharset();
        if (textMDEncoding.indexOf("UTF") != -1) {
            _textMD.setByte_order(
                    _bigEndian?TextMDMetadata.BYTE_ORDER_BIG:TextMDMetadata.BYTE_ORDER_LITTLE);
                _textMD.setByte_size("8");
                _textMD.setCharacter_size("variable");
        } 
        else {
            _textMD.setByte_order(
                    _bigEndian?TextMDMetadata.BYTE_ORDER_BIG:TextMDMetadata.BYTE_ORDER_LITTLE);
                _textMD.setByte_size("8");
                _textMD.setCharacter_size("1");
        }
        // CRLF from XmlDeclStream ...
        String lineEnd = xds.getKindOfLineEnd();
        if (lineEnd == null) {
            info.setMessage(new InfoMessage("Not able to determine type of end of line"));
            _textMD.setLinebreak(TextMDMetadata.NILL);
        } else if ("CR".equalsIgnoreCase(lineEnd)) {
            _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR);
        } else if ("LF".equalsIgnoreCase(lineEnd)) {
            _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF);
        } else if ("CRLF".equalsIgnoreCase(lineEnd)) {
            _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF);
        }
        
        // Add the standalone property.
        String sa = xds.getStandalone ();
        if (sa != null) {
            _propList.add (new Property ("Standalone",
                        PropertyType.STRING,
                        sa));
        }
        
        // Add the DTD property.
        if (dtdURI != null) {
            _propList.add (new Property ("DTD_URI",
                        PropertyType.STRING,
                        dtdURI));
        }

        if (!schemaList.isEmpty ()) {
            // Build a List of Properties, which will be the value
            // of the Schemas Property.
            List schemaPropList = new ArrayList (schemaList.size());
            ListIterator iter = schemaList.listIterator();
            // Iterate through all the schemas.
            while (iter.hasNext ()) {
                SchemaInfo schItems = iter.next ();
                // Build a Property (Schema) whose value is an array
                // of two Properties (NamespaceURI and SchemaLocation).
                Property [] schItemProps = new Property[2];
                schItemProps[0] = new Property ("NamespaceURI",
                        PropertyType.STRING,
                        schItems.namespaceURI);
                schItemProps[1] = new Property ("SchemaLocation",
                        PropertyType.STRING,
                        schItems.location);
                schemaPropList.add (new Property ("Schema",
                        PropertyType.PROPERTY,
                        PropertyArity.ARRAY,
                        schItemProps));
            }
            // Now put the list into a Property, which goes into
            // the metadata.
            Property prop = new Property ("Schemas",
                            PropertyType.PROPERTY,
                            PropertyArity.LIST,
                            schemaPropList);
            _propList.add (prop);
        }
        
        // Add the root element.
        String root = handler.getRoot ();
        String rootPrefix = null;
        if (root != null) {
            _propList.add (new Property ("Root",
                        PropertyType.STRING,
                        root));
            if ("html".equals (root)) {
                // Specify format as XHTML
                info.setFormat (_format[1]);
                // Set the version according to the doctype... how?

            }
            // Get the prefix of root
            int indexOfColon = root.indexOf(':');
            if (indexOfColon != -1) {
                rootPrefix = root.substring(0, indexOfColon);
            }
        }
        if (rootPrefix == null) {
            rootPrefix = "";
        }
        
        // Declare properties we're going to add.  They have
        // some odd interdependencies, so we create them all
        // and them add them in the right (specified) order.
        Property namespaceProp = null;
        Property notationsProp = null;
        Property charRefsProp = null;
        Property entitiesProp = null;
        Property procInstProp = null;
        Property commentProp = null;
        Property unicodeBlocksProp = null;
        
        Map ns = handler.getNamespaces ();
        if (!ns.isEmpty ()) {
            Set keys = ns.keySet ();
            List nsList = new ArrayList (keys.size());
            Iterator iter = keys.iterator();
            while (iter.hasNext ()) {
                String key =  iter.next ();
                String val = ns.get (key);
                Property [] supPropArr = new Property[2];
                supPropArr[0] = new Property ("Prefix",
                            PropertyType.STRING,
                            key);
                supPropArr[1] = new Property ("URI",
                            PropertyType.STRING,
                            val);
                Property onens = new Property ("Namespace",
                            PropertyType.PROPERTY,
                            PropertyArity.ARRAY,
                            supPropArr);
                nsList.add (onens); 
                
                // Try to find the namespace URI of root
                if (rootPrefix.equalsIgnoreCase(key) && isNotEmpty(val)) {
                    _textMD.setMarkup_language(val);
                }
            }
            namespaceProp = new Property ("Namespaces",
                            PropertyType.PROPERTY,
                            PropertyArity.LIST,
                            nsList);
        }
       
        // CharacterReferences property goes here.
        // Report as a list of 4-digit hexadecimal strings,
        // e.g., 003C, 04AA, etc.
        // Also build the Unicode blocks here.
        List refs = xds.getCharacterReferences ();
        if (!refs.isEmpty ()) {
            Utf8BlockMarker utf8BM = new Utf8BlockMarker ();
            List refList = new ArrayList (refs.size ());
            ListIterator iter = refs.listIterator ();
            while (iter.hasNext ()) {
                Integer refi = iter.next ();
                int refint = refi.intValue ();
                refList.add (intTo4DigitHex (refint));
                utf8BM.markBlock(refint);
            }
            charRefsProp = new Property 
                    ("CharacterReferences",
                     PropertyType.STRING,
                     PropertyArity.LIST,
                     refList);
            unicodeBlocksProp = 
                utf8BM.getBlocksUsedProperty("UnicodeCharRefBlocks");
        }
        
        // Entities property
        // External unparsed entities
        Set entNames = lexHandler.getEntityNames ();
        Set attributeVals = handler.getAttributeValues ();
        List entProps = new LinkedList ();
        List uent = handler.getUnparsedEntities ();
        List unparsedNotationNames = new LinkedList ();
        if (!uent.isEmpty ()) {
             ListIterator iter = uent.listIterator ();
            while (iter.hasNext ()) {
                // We check external unparsed entities against
                // the list of attribute values which we've
                // accumulated.  If an unparsed entity name matches an
                // attribute value, we assume it's used.
                String[] entarr = iter.next ();
                String name = entarr[0];
                if (nameInCollection (name, attributeVals)) {
                    // Add the notation name to the list 
                    // unparsedNotationNames, so we can use it
                    // in determining which notations are used.
                    unparsedNotationNames.add (entarr[3]);
                    List subPropList = new ArrayList (6);
                    subPropList.add( new Property ("Name",
                            PropertyType.STRING,
                            name));
                    subPropList.add (new Property ("Type",
                            PropertyType.STRING,
                            "External unparsed"));
                    subPropList.add( new Property ("PublicID",
                            PropertyType.STRING,
                            entarr[1]));
                    subPropList.add( new Property ("SystemID",
                            PropertyType.STRING,
                            entarr[2]));
                    subPropList.add( new Property ("NotationName",
                            PropertyType.STRING,
                            entarr[3]));
                    
                    entProps.add (new Property ("Entity",
                            PropertyType.PROPERTY,
                            PropertyArity.LIST,
                            subPropList));
                }
            }
        }
  
        // Internal entities
        List declEnts = declHandler.getInternalEntityDeclarations ();
        if (!declEnts.isEmpty ()) {
            ListIterator iter = declEnts.listIterator ();
            while (iter.hasNext ()) {
                String[] entarr =  iter.next ();
                String name = entarr[0];
                // include only if the entity was actually used
                if (nameInCollection (name, entNames)) {
                    List subPropList = new ArrayList (4);
                    subPropList.add (new Property ("Name",
                            PropertyType.STRING,
                            name));
                    subPropList.add (new Property ("Type",
                            PropertyType.STRING,
                            "Internal"));
                    subPropList.add (new Property ("Value",
                            PropertyType.STRING,
                            entarr[1]));
                    entProps.add (new Property ("Entity",
                        PropertyType.PROPERTY,
                        PropertyArity.LIST,
                        subPropList));
                }
            }
        }

        // External parsed entities
        declEnts = declHandler.getExternalEntityDeclarations ();
        if (!declEnts.isEmpty ()) {
            ListIterator iter = declEnts.listIterator ();
            while (iter.hasNext ()) {
                String[] entarr =  iter.next ();
                String name = entarr[0];
                // include only if the entity was actually used
                if (nameInCollection (name, entNames)) {
                    List subPropList = new ArrayList (4);
                    subPropList.add (new Property ("Name",
                            PropertyType.STRING,
                            name));
                    subPropList.add (new Property ("Type",
                            PropertyType.STRING,
                            "External parsed"));
                    if (entarr[1] != null) {
                        subPropList.add (new Property ("PublicID",
                            PropertyType.STRING,
                            entarr[1]));
                    }
                    if (entarr[2] != null) {
                        subPropList.add (new Property ("SystemID",
                            PropertyType.STRING,
                            entarr[2]));
                    }
                    
                    entProps.add (new Property ("Entity",
                        PropertyType.PROPERTY,
                        PropertyArity.LIST,
                        subPropList));
                }
            }
        }
        
        if (!entProps.isEmpty ()) {
            entitiesProp = new Property ("Entities",
                    PropertyType.PROPERTY,
                    PropertyArity.LIST,
                    entProps);
        }
        
        List pi = handler.getProcessingInstructions ();
        List piTargets = new LinkedList ();
        if (!pi.isEmpty()) {
            // Build a property, which consists of a list
            // of properties, each of which is an array of
            // two String properties, named Target and
            // Data respectively.
            List piPropList = new ArrayList (pi.size());
            ListIterator pii = pi.listIterator ();
            while (pii.hasNext ()) {
                ProcessingInstructionInfo pistr =  pii.next ();
                Property[] subPropArr = new Property[2];
                // Accumulate targets in a list, so we can tell
                // which Notations use them.
                // Wait a minute -- what we're doing here can't work!! TODO what's supposed to be happening?
                //piTargets.add (subPropArr[0]);
                subPropArr[0] = new Property ("Target",
                            PropertyType.STRING,
                            pistr.target);
                subPropArr[1] = new Property ("Data",
                            PropertyType.STRING,
                            pistr.data);
                piPropList.add(new Property ("ProcessingInstruction",
                            PropertyType.PROPERTY,
                            PropertyArity.ARRAY,
                            subPropArr));
            }
            procInstProp = new Property ("ProcessingInstructions",
                            PropertyType.PROPERTY,
                            PropertyArity.LIST,
                            piPropList);
        }

        // Notations property.  We list notations only if they're
        // "actually used," meaning that they designate either
        // the target of a processing instruction or the ndata
        // of an unparsed entry which is itself "actually used."
        List notations = handler.getNotations ();
        if (!notations.isEmpty ()) {
            List notProps = new ArrayList (notations.size ());
            ListIterator iter = notations.listIterator ();
            List subPropList = new ArrayList (3);
            while (iter.hasNext ()) {
                String[] notArray = iter.next();
                String notName = notArray[0];
                // Check for use of Notation before including 
                // TODO this is implemented wrong! Need to reinvestigate
                if (nameInCollection (notName, piTargets) ||
                    nameInCollection (notName, unparsedNotationNames)) {
                    // notArray has name, public ID, system ID
                    subPropList.add (new Property ("Name",
                                PropertyType.STRING,
                                notName));
                    if (notArray[1] != null) {
                        subPropList.add (new Property ("PublicID",
                                PropertyType.STRING,
                                notArray[1]));
                    }
                    if (notArray[2] != null) {
                        subPropList.add (new Property ("SystemID",
                                PropertyType.STRING,
                                notArray[2]));
                    }
                    notProps.add (new Property ("Notation",
                                PropertyType.PROPERTY,
                                PropertyArity.LIST,
                                subPropList));
                }
            }
            // Recheck emptiness in case only unprocessed notations were found
            if (!notProps.isEmpty()) {
                notationsProp = new Property ("Notations",
                        PropertyType.PROPERTY,
                        PropertyArity.LIST,
                        notProps);
            }
        } 

        // Now add all the properties we created.
        if (namespaceProp != null) {
            _propList.add (namespaceProp);
        }
        if (notationsProp != null) {
            _propList.add (notationsProp);
        }
        if (charRefsProp != null) {
            _propList.add (charRefsProp);
        }
        if (unicodeBlocksProp != null) {
            _propList.add (unicodeBlocksProp);
        }
        if (entitiesProp != null) {
            _propList.add (entitiesProp);
        }
        if (procInstProp != null) {
            _propList.add (procInstProp);
        }

        List comm = lexHandler.getComments ();
        if (!comm.isEmpty ()) {
            commentProp = new Property ("Comments",
                            PropertyType.STRING,
                            PropertyArity.LIST,
                            comm);
        }
        if (commentProp != null) {
            _propList.add (commentProp);
        }
        
        // Check if parse detected invalid XML
        if (!handler.isValid ()) {
            info.setValid (false);
        }

        if (info.getWellFormed () == RepInfo.TRUE) {
            if (_xhtmlDoctype != null) {
                info.setMimeType (_mimeType[2]);
            }
            else {
                info.setMimeType (_mimeType[0]);
            }
        }
        
        // Add any messages from the parse.
        List msgs = handler.getMessages ();
        ListIterator msgi = msgs.listIterator ();
        while (msgi.hasNext ()) {
            info.setMessage ((Message) msgi.next ());
        } 
        
        if (_withTextMD) {
            _textMD.setMarkup_basis(info.getFormat());
            _textMD.setMarkup_basis_version(info.getVersion());
            Property property = new Property ("TextMDMetadata",
                    PropertyType.TEXTMDMETADATA, PropertyArity.SCALAR, _textMD);
            _propList.add(property);
        }
        
        if (_ckSummer != null){
            info.setChecksum (new Checksum (_ckSummer.getCRC32 (), 
                        ChecksumType.CRC32));
            String value = _ckSummer.getMD5 ();
            if (value != null) {
                info.setChecksum (new Checksum (value, ChecksumType.MD5));
            }
            if ((value = _ckSummer.getSHA1 ()) != null) {
                info.setChecksum (new Checksum (value, ChecksumType.SHA1));
            }
        }
        if (info.getVersion () == null) {
            info.setVersion ("1.0");
            _textMD.setMarkup_basis_version("1.0");
        }
        return 0;
    }
    
    
    /**
     *  Check if the digital object conforms to this Module's
     *  internal signature information.
     *  
     *  XML is a particularly messy case; in general, there's no 
     *  even moderately good way to check "signatures" without parsing
     *  the whole file, since the document declaration is optional.
     *  We provide the user two choices, based on the "s" parameter.
     *  If 's' is the first character of the module parameter, then
     *  we look for an XML document declaration, and say there's no
     *  signature if it's missing. (This can reject well-formed
     *  XML files, though not valid ones.) Otherwise, if there's no
     *  document declaration, we parse the whole file.
     *
     *  On entry the format, MIME type and module of <code>info</code>
     *  are set unconditionally; the signature match is recorded only
     *  when the declaration is found or the fallback parse reports the
     *  document as well-formed.
     *
     *   @param file      A File object for the object being parsed
     *   @param stream    An InputStream, positioned at its beginning,
     *                    which is generated from the object to be parsed
     *   @param info      A fresh RepInfo object which will be modified
     *                    to reflect the results of the test
     *   @throws IOException  if closing the stream or reopening the file
     *                    for the fallback parse fails (presumably also
     *                    when propagated from parse -- confirm)
     */
    @Override
    public void checkSignatures (File file,
                InputStream stream, 
                RepInfo info) 
        throws IOException
    {
        _parseFromSig = false;
        info.setFormat (_format[0]);
        info.setMimeType (_mimeType[0]);
        info.setModule (this);
        // NOTE(review): this listing is truncated on the next line. The text
        // between the opening quote of the sigStr literal and the
        // ">= sigStr.length()" comparison is missing, including the opening
        // of the try block and of the signature-scanning loop. Restore it
        // from the upstream source before building; do not reconstruct it
        // by guesswork.
        String sigStr = "= sigStr.length()) {
                        info.setSigMatch(_name);
                        return;     // sig matches
                    }
                }
                else break;
            }
        }
        catch (IOException e) {
            // A read failure while scanning for the signature is reported
            // as "not well-formed" rather than propagated.
            info.setWellFormed (false);
            return;
        }
        if (_sigWantsDecl) {
            
            // No XML declaration, and it's mandatory according to the param.
            info.setWellFormed (false);
            return;
        }
        
        // No XML signature, but we're allowed to parse the file now.
        // This means rewinding back to the start of the file.
        int parseIndex = 1;
        _parseFromSig = true;    // we set the sig match ourselves
        // Keep parsing until parse returns 0; every pass closes the
        // current stream and reads from a freshly opened one.
        while (parseIndex != 0) {
            stream.close ();
            stream = new FileInputStream (file);
            parseIndex = parse (stream, info, parseIndex);
        }
        if (info.getWellFormed() == RepInfo.TRUE) {
            info.setSigMatch (_name);
        }
    }
    
    
    
    /**
     * Prepares the module for a new parse.  All of the work is done by
     * the superclass implementation; nothing else is reset here.
     */
    @Override
    protected void initParse ()
    {
        // A disabled variant of this method used to walk _defaultParams
        // and hand every entry starting with "localschema=" (compared in
        // lower case) to addLocalSchema.  It is intentionally not run here.
        super.initParse ();
    }
    
    /**
     * Checks whether a String is <code>equals</code> to any member of a
     * collection.
     *
     * The comparison is always <code>name.equals (member)</code>, so
     * members that are not strings (or are null) simply never match.
     *
     * @param name  the string to look for; null never matches
     * @param coll  the collection to search, of any element type;
     *              null is treated as an empty collection
     * @return true if at least one member of coll equals name
     */
    protected static boolean nameInCollection (String name, Collection<?> coll) 
    {
        // Guard against null arguments instead of failing with a
        // NullPointerException in the middle of property building.
        if (name == null || coll == null) {
            return false;
        }
        for (Object member : coll) {
            if (name.equals (member)) {
                return true;
            }
        }
        return false;
    }
    
    /**
     * Renders the low 16 bits of an int as exactly four upper-case
     * hexadecimal digits, e.g., 003F or F10A.  Any higher bits of the
     * argument are ignored.  This is used for Character References.
     *
     * @param n  the value to format
     * @return a four-character hexadecimal string
     */
    protected static String intTo4DigitHex (int n)
    {
        final String hexDigits = "0123456789ABCDEF";
        char[] out = new char[4];
        int rest = n;
        // Fill the buffer from the least significant nybble backwards.
        for (int pos = out.length - 1; pos >= 0; pos--) {
            out[pos] = hexDigits.charAt (rest & 0xF);
            rest >>>= 4;
        }
        return new String (out);
    }
    
    /**
     * Tells whether a string holds something useful.
     * @param value string to test
     * @return false when the value is null, the empty string, or the
     *         literal placeholder "[None]"; true otherwise
     */
    protected static boolean isNotEmpty(String value) {
        if (value == null || value.length() == 0) {
            return false;
        }
        return !value.equals("[None]");
    }
    
    /**
     * Add a mapping from a schema URI to a local file.
     * The parameter is of the form schema=[URI];[path].
     *
     * The URI is the text between the first '=' and the first ';', the
     * path is everything after that ';', and both are trimmed.  The
     * mapping is stored in _localSchemas only when the path names an
     * existing file.  This is a best-effort operation: a null parameter,
     * a parameter without ';', or one whose ';' precedes the '=' is
     * silently ignored.
     *
     * @param param  the parameter string to interpret
     */
    private void addLocalSchema (String param) {
        if (param == null) {
            return;
        }
        int eq = param.indexOf('=');
        int semi = param.indexOf(';');
        // Reject malformed input up front instead of relying on a
        // swallowed StringIndexOutOfBoundsException.  This covers both a
        // missing ';' (semi == -1) and a ';' that comes before the '='.
        if (semi <= eq) {
            return;
        }
        String uri = param.substring(eq + 1, semi).trim();
        String path = param.substring(semi + 1).trim();
        File f = new File (path);
        try {
            if (_localSchemas != null && f.exists()) {
                _localSchemas.put (uri, f);
            }
        }
        catch (SecurityException ignored) {
            // The file cannot be inspected under the current security
            // policy; treat it like a missing file and skip the mapping.
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy