// All downloads are free. Search and download functionality uses the official Maven repository.

// edu.harvard.hul.ois.jhove.module.html.HtmlMetadata (Maven / Gradle / Ivy)

/**********************************************************************
 * Jhove - JSTOR/Harvard Object Validation Environment
 * Copyright 2004-2009 by JSTOR and the President and Fellows of Harvard College
 **********************************************************************/

package edu.harvard.hul.ois.jhove.module.html;

import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;

import edu.harvard.hul.ois.jhove.Property;
import edu.harvard.hul.ois.jhove.PropertyArity;
import edu.harvard.hul.ois.jhove.PropertyType;
import edu.harvard.hul.ois.jhove.TextMDMetadata;
import edu.harvard.hul.ois.jhove.module.Utf8BlockMarker;

/**
 * Repository for an HTML document's metadata.
 * Also holds some state information, so that properties involving
 * tags, attributes and pcdata can be constructed.
 *
 * Collections are created lazily: each one stays null until the first
 * item is added, so {@link #toProperty} never emits an empty list or set.
 *
 * @author Gary McGath
 *
 */
public class HtmlMetadata {

    private String _title;
    private String _lang;
    private List _meta;
    private String _charset;
    private TreeSet _languages;
    private List _links;
    private List _images;
    private List _citations;
    private List _defs;
    private List _frames;
    private List _scripts;
    private List _abbrs;
    private TreeSet _entities;
    private Property _propUnderConstruction;

    /** Object for tracking UTF8 blocks. */
    private Utf8BlockMarker utf8BM;

    /** Constructor.  Initializes to the empty state. */
    public HtmlMetadata ()
    {
        // Mostly sets variables to their defaults; it's good
        // documentation practice.  Lists are set to null until
        // there's actually something to add to them; this guarantees
        // that toProperty() doesn't have to deal with empty lists.
        _title = null;
        _lang = null;
        _meta = null;
        _charset = null;
        _links = null;
        _images = null;
        _citations = null;
        _defs = null;
        _frames = null;
        _scripts = null;
        _abbrs = null;
        _entities = null;
        _languages = null;
        _propUnderConstruction = null;
        utf8BM = new Utf8BlockMarker ();
    }

    /** Stores the contents of the TITLE element. */
    public void setTitle (String title)
    {
        _title = title;
    }

    /** Stores the primary language of the document.  This is also called
     *  by {@link #addMeta} when a Content-Language META tag is seen. */
    public void setLanguage (String lang)
    {
        _lang = lang;
    }

    /** Add a language defined in an attribute of any element
     *  except the HTML element.  A null language, or one equal to
     *  the primary language, is ignored.
     *
     *  @param lang  the language code to record; may be null
     */
    public void addLanguage (String lang)
    {
        // A null can be neither compared nor stored in a naturally
        // ordered TreeSet, so drop it rather than throw.
        if (lang == null || lang.equals (_lang)) {
            return;
        }
        if (_languages == null) {
            _languages = new TreeSet ();
        }
        _languages.add (lang);
    }

    /** Adds a CITE element's pcdata to the Citations property. */
    public void addCitation (String text)
    {
        _citations = append (_citations, text);
    }

    /**
     * Adds a META tag's contents to the Meta property.
     * If the tag is an http-equiv Content-Type with a charset parameter,
     * the charset is stored via {@link #setCharset}; if it is an
     * http-equiv Content-Language, its content is stored via
     * {@link #setLanguage}.
     *
     * @param prop  Property describing the META tag; a null is ignored
     */
    public void addMeta (Property prop)
    {
        if (prop == null) {
            return;
        }
        // We don't set _meta until there's a property;
        // thus, we guarantee it will never be an empty list.
        _meta = append (_meta, prop);

        // Is it a httpequiv=Content-Type ?
        String valContentType = extractHttpEquivValue (prop, "Content-Type");
        if (valContentType != null) {
            String charset = extractCharset (valContentType);
            if (charset != null) {
                setCharset (charset);
            }
        }
        // Is it a httpequiv=Content-Language ?
        String valContentLanguage =
            extractHttpEquivValue (prop, "Content-Language");
        if (valContentLanguage != null) {
            setLanguage (valContentLanguage);
        }
    }

    /**
     * Extract the content value associated with a given httpEquiv.
     * The value is returned only when both the "Httpequiv" and the
     * "Content" sub-properties exist and are scalar strings, and the
     * "Httpequiv" value matches <code>httpEquivValue</code> ignoring case.
     *
     * @param prop Property containing the description of the meta tag
     * @param httpEquivValue the httpEquiv to consider
     * @return the content value, or null if there is no match
     */
    public String extractHttpEquivValue (Property prop, String httpEquivValue)
    {
        if (prop == null || httpEquivValue == null) {
            return null;
        }
        Property httpEquiv = prop.getByName ("Httpequiv");
        if (!isScalarString (httpEquiv)) {
            return null;
        }
        if (!httpEquivValue.equalsIgnoreCase ((String) httpEquiv.getValue ())) {
            return null;
        }
        Property content = prop.getByName ("Content");
        if (!isScalarString (content)) {
            return null;
        }
        return (String) content.getValue ();
    }

    /** Stores the charset of the document.  This is also called by
     *  {@link #addMeta} when a Content-Type META tag carries a charset. */
    public void setCharset (String charset)
    {
        _charset = charset;
    }

    /** Adds a FRAME tag's contents to the Frames property. */
    public void addFrame (Property prop)
    {
        // We don't set _frames until there's a property;
        // thus, we guarantee it will never be an empty list.
        _frames = append (_frames, prop);
    }

    /** Adds an ABBR tag's contents to the Abbreviations property. */
    public void addAbbr (Property prop)
    {
        _abbrs = append (_abbrs, prop);
    }

    /** Adds a link to the Links property. */
    public void addLink (String link)
    {
        _links = append (_links, link);
    }

    /** Adds an item to the Images property. */
    public void addImage (Property prop)
    {
        _images = append (_images, prop);
    }

    /** Adds a defined term to the Defined Terms property. */
    public void addDef (String text)
    {
        _defs = append (_defs, text);
    }

    /** Adds the language of a SCRIPT element to the Scripts property. */
    public void addScript (String stype)
    {
        _scripts = append (_scripts, stype);
    }

    /** Adds a String to the Entities property.  This property is a
     *  SortedSet, so duplicates are not added, and the resulting set
     *  can be iterated in alphabetical order.  A null is ignored. */
    public void addEntity (String entity)
    {
        // A naturally ordered TreeSet rejects null elements.
        if (entity == null) {
            return;
        }
        if (_entities == null) {
            _entities = new TreeSet ();
        }
        _entities.add (entity);
    }

    /** Returns the UTF8BlockMarker for the metadata. */
    public Utf8BlockMarker getUtf8BlockMarker ()
    {
        return utf8BM;
    }

    /** Returns the contents of the TITLE element. */
    public String getTitle ()
    {
        return _title;
    }

    /** Returns the stored charset, or null if none has been set. */
    public String getCharset ()
    {
        return _charset;
    }

    /**
     * Converts the metadata to a Property.
     * Only items that have actually been set or added are included.
     * If <code>_textMD</code> is non-null it is appended as the
     * "TextMDMetadata" sub-property, and its language is set to the
     * primary language when one is known.
     *
     * @param _textMD  text metadata to include; may be null
     * @return the "HTMLMetadata" property, or null if there is
     *         nothing to report
     */
    public Property toProperty (TextMDMetadata _textMD)
    {
        List propList = new LinkedList ();
        Property val = new Property ("HTMLMetadata",
                                     PropertyType.PROPERTY,
                                     PropertyArity.LIST,
                                     propList);
        if (_lang != null) {
            propList.add (new Property ("PrimaryLanguage",
                    PropertyType.STRING,
                    _lang));
            if (_textMD != null) {
                _textMD.setLanguage (_lang);
            }
        }
        if (_languages != null) {
            propList.add (new Property ("OtherLanguages",
                    PropertyType.STRING,
                    PropertyArity.SET,
                    _languages));
        }
        if (_title != null) {
            propList.add (new Property ("Title",
                    PropertyType.STRING,
                    _title));
        }
        if (_meta != null) {
            // We're guaranteed that if _meta isn't null, it's non-empty.
            propList.add (new Property ("MetaTags",
                    PropertyType.PROPERTY,
                    PropertyArity.LIST,
                    _meta));
        }
        if (_frames != null) {
            propList.add (new Property ("Frames",
                    PropertyType.PROPERTY,
                    PropertyArity.LIST,
                    _frames));
        }
        if (_links != null) {
            propList.add (new Property ("Links",
                    PropertyType.STRING,
                    PropertyArity.LIST,
                    _links));
        }
        if (_scripts != null) {
            propList.add (new Property ("Scripts",
                    PropertyType.STRING,
                    PropertyArity.LIST,
                    _scripts));
        }
        if (_images != null) {
            propList.add (new Property ("Images",
                    PropertyType.PROPERTY,
                    PropertyArity.LIST,
                    _images));
        }
        if (_citations != null) {
            propList.add (new Property ("Citations",
                    PropertyType.STRING,
                    PropertyArity.LIST,
                    _citations));
        }
        if (_defs != null) {
            propList.add (new Property ("DefinedTerms",
                    PropertyType.STRING,
                    PropertyArity.LIST,
                    _defs));
        }
        if (_abbrs != null) {
            propList.add (new Property ("Abbreviations",
                    PropertyType.PROPERTY,
                    PropertyArity.LIST,
                    _abbrs));
        }
        if (_entities != null) {
            propList.add (new Property ("Entities",
                    PropertyType.STRING,
                    PropertyArity.SET,
                    _entities));
        }
        if (utf8BM != null) {
            Property p = utf8BM.getBlocksUsedProperty ("UnicodeEntityBlocks");
            if (p != null) {
                propList.add (p);
            }
        }
        if (_textMD != null) {
            propList.add (new Property ("TextMDMetadata",
                    PropertyType.TEXTMDMETADATA,
                    PropertyArity.SCALAR,
                    _textMD));
        }

        if (propList.isEmpty ()) {
            return null;
        }

        return val;
    }

    /** Sets a "property under construction".  This is generally
     *  called when an XML element is found, and the PCDATA must
     *  be incorporated into the property.
     */
    public void setPropUnderConstruction (Property p)
    {
        _propUnderConstruction = p;
    }

    /** Returns the "property under construction." */
    public Property getPropUnderConstruction ()
    {
        return _propUnderConstruction;
    }

    /** Adds PCDATA text to the property under construction.
     *  This may not all be provided in one lump, so it
     *  has to allow for multiple chunks.  Does nothing if no
     *  property is under construction, or if its name is not
     *  "abbr", "title" or "dfn".
     *
     *  @param ch      buffer holding the text
     *  @param start   index of the first character to use
     *  @param length  number of characters to use
     */
    public void addToPropUnderConstruction
            (char[] ch, int start, int length)
    {
        if (_propUnderConstruction == null) {
            return;
        }
        String argStr = new String (ch, start, length);
        String name = _propUnderConstruction.getName ();
        Object val = _propUnderConstruction.getValue ();
        if ("abbr".equals (name)) {
            // Theoretically, this can come in more than one chunk;
            // each chunk is inserted as its own "abbr" entry at the
            // head of the list, so later chunks precede earlier ones.
            List propList = (List) val;
            propList.add (0, new Property ("abbr",
                    PropertyType.STRING,
                    argStr));
        }
        else if ("title".equals (name) ||
                 "dfn".equals (name)) {
            // For these properties, we just need to maintain
            // the String.  But to keep the design consistent and
            // simple, we maintain the Property and then just pull
            // out the String at the end.
            // A Property is immutable.  Rather than risk obscure
            // consequences from changing this assumption, we append
            // the text to a new Property.
            // A null value is treated as empty so the literal text
            // "null" never ends up in the result.
            String soFar = (val == null) ? "" : (String) val;
            _propUnderConstruction = new Property (name,
                    PropertyType.STRING,
                    soFar + argStr);
        }
    }

    /** Finishes any property under construction.  This is called
     *  when an end element is encountered.  An "abbr" property is
     *  added to the Abbreviations list and a "title" property becomes
     *  the document title; in every case the property under
     *  construction is then cleared. */
    public void finishPropUnderConstruction ()
    {
        if (_propUnderConstruction == null) {
            return;
        }
        String name = _propUnderConstruction.getName ();
        if ("abbr".equals (name)) {
            addAbbr (_propUnderConstruction);
        }
        else if ("title".equals (name)) {
            _title = (String) _propUnderConstruction.getValue ();
        }
        // NOTE(review): "dfn" text is accumulated by
        // addToPropUnderConstruction but is not stored here; presumably
        // the caller retrieves it via getPropUnderConstruction() before
        // calling this method -- confirm, otherwise it is lost.
        _propUnderConstruction = null;
    }

    /** Appends an item to a lazily created list and returns the list,
     *  creating it first if <code>list</code> is null. */
    private static List append (List list, Object item)
    {
        List target = (list != null) ? list : new LinkedList ();
        target.add (item);
        return target;
    }

    /** Returns true if the property is non-null, scalar and of
     *  string type. */
    private static boolean isScalarString (Property p)
    {
        return p != null &&
               PropertyArity.SCALAR.equals (p.getArity ()) &&
               PropertyType.STRING.equals (p.getType ());
    }

    /**
     * Pulls the charset parameter out of a Content-Type value such as
     * <code>text/html; charset=UTF-8</code>.  The parameter name is
     * matched ignoring case, whitespace around the '=' is skipped,
     * a quoted value is unquoted, and an unquoted value ends at the
     * first ';' or whitespace.
     *
     * @param contentType  the Content-Type value; must not be null
     * @return the charset name, or null if none is present
     */
    private static String extractCharset (String contentType)
    {
        final String key = "charset";
        final int len = contentType.length ();
        int pos = 0;
        while (pos + key.length () <= len) {
            if (!contentType.regionMatches (true, pos, key, 0, key.length ())) {
                pos++;
                continue;
            }
            int i = skipWhitespace (contentType, pos + key.length ());
            if (i >= len || contentType.charAt (i) != '=') {
                // The word "charset" without a following '=' is not
                // the parameter; keep scanning after it.
                pos += key.length ();
                continue;
            }
            i = skipWhitespace (contentType, i + 1);
            if (i >= len) {
                return null;
            }
            char first = contentType.charAt (i);
            if (first == '"' || first == '\'') {
                int close = contentType.indexOf (first, i + 1);
                if (close < 0) {
                    // Unterminated quote: no usable value.
                    return null;
                }
                String quoted = contentType.substring (i + 1, close).trim ();
                return (quoted.length () > 0) ? quoted : null;
            }
            int end = i;
            while (end < len) {
                char c = contentType.charAt (end);
                if (c == ';' || Character.isWhitespace (c)) {
                    break;
                }
                end++;
            }
            return (end > i) ? contentType.substring (i, end) : null;
        }
        return null;
    }

    /** Returns the index of the first non-whitespace character of
     *  <code>s</code> at or after <code>from</code>, or the length of
     *  <code>s</code> if there is none. */
    private static int skipWhitespace (String s, int from)
    {
        int i = from;
        while (i < s.length () && Character.isWhitespace (s.charAt (i))) {
            i++;
        }
        return i;
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy