All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sourceforge.tink.model.impl.TinkParser Maven / Gradle / Ivy

/**
 * Copyright 2008,2009 Ivan SZKIBA
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package net.sourceforge.tink.model.impl;

import net.sourceforge.tink.model.FileObject;
import net.sourceforge.tink.model.Page;
import net.sourceforge.tink.model.TinkContext;
import net.sourceforge.tink.model.TinkException;

import org.cyberneko.html.parsers.DOMParser;

import org.w3c.dom.Document;
import org.w3c.dom.Element;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;

import java.util.Date;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathFactory;

/**
 * Parses input files into DOM documents and {@link Page} models.
 *
 * <p>Data files are parsed with a JAXP {@link DocumentBuilder}; HTML pages are
 * parsed with a NekoHTML {@link DOMParser} and then inspected with XPath to
 * fill in the page's head, body and title.</p>
 *
 * <p>A single builder, parser and XPath instance are created in
 * {@link #init(TinkContext)} and reused for every call.
 * NOTE(review): because the parser is reset and reconfigured on each
 * {@link #parsePage(FileObject)} call, concurrent use of one instance looks
 * unsafe -- confirm against callers.</p>
 */
public class TinkParser extends AbstractTinkComponent
{
    private static final String PARSE_ERROR_MESSAGE = "Error parsing file: ";
    private static final String DOM_CONFIG_ERROR = "DOM configuration problem";
    private static final String PARENT_DIR = "..";
    private static final char PATH_SEPARATOR = '/';
    private DocumentBuilder builder;
    private DOMParser parser;
    private XPath xpath;

    /**
     * Initializes the component and creates the XPath evaluator, the HTML
     * parser and the XML document builder used by the parse methods.
     *
     * @param context the context passed on to the superclass initializer
     * @throws TinkException if the XML document builder cannot be configured
     */
    @Override public void init(TinkContext context) throws TinkException
    {
        super.init(context);
        setXpath(XPathFactory.newInstance().newXPath());
        setParser(new DOMParser());
        try
        {
            setBuilder(DocumentBuilderFactory.newInstance().newDocumentBuilder());
        }
        catch (ParserConfigurationException x)
        {
            throw new TinkException(DOM_CONFIG_ERROR, x);
        }
    }

    /**
     * Parses the given file with the XML document builder.
     *
     * @param input the file to parse
     * @return the parsed DOM document
     * @throws TinkException if parsing fails for any reason; the original
     *         exception is preserved as the cause
     */
    public Document parseData(FileObject input) throws TinkException
    {
        try
        {
            return getBuilder().parse(input);
        }
        catch (Exception x)
        {
            throw new TinkException(PARSE_ERROR_MESSAGE + input.getAbsolutePath(), x);
        }
    }

    /**
     * Parses the given file as HTML and builds a {@link Page} from it.
     *
     * <p>The reader opened on the input is always closed before this method
     * returns, whether parsing succeeds or fails.</p>
     *
     * @param input the HTML file to parse
     * @return the page model built from the parsed document
     * @throws TinkException if the parser cannot be configured, the file
     *         cannot be read or parsed, or the XPath evaluation fails; the
     *         original exception is preserved as the cause
     */
    public Page parsePage(FileObject input) throws TinkException
    {
        try
        {
            DOMParser p = newDOMParser();
            Reader reader = input.openReader();

            try
            {
                p.parse(new InputSource(reader));
            }
            finally
            {
                // Release the underlying file handle even when parsing fails
                // part-way; closing quietly keeps the parse error (if any)
                // from being masked by a secondary close failure.
                closeQuietly(reader);
            }

            return build(input, p.getDocument());
        }
        catch (Exception x)
        {
            throw new TinkException(PARSE_ERROR_MESSAGE + input.getAbsolutePath(), x);
        }
    }

    /**
     * @return the XML document builder used by {@link #parseData(FileObject)}
     */
    protected DocumentBuilder getBuilder()
    {
        return builder;
    }

    /**
     * @param value the XML document builder to use
     */
    protected void setBuilder(DocumentBuilder value)
    {
        this.builder = value;
    }

    /**
     * @return the HTML parser used by {@link #parsePage(FileObject)}
     */
    protected DOMParser getParser()
    {
        return parser;
    }

    /**
     * @param value the HTML parser to use
     */
    protected void setParser(DOMParser value)
    {
        this.parser = value;
    }

    /**
     * @return the XPath evaluator used to inspect parsed pages
     */
    protected XPath getXpath()
    {
        return xpath;
    }

    /**
     * @param value the XPath evaluator to use
     */
    protected void setXpath(XPath value)
    {
        this.xpath = value;
    }

    /**
     * Builds a {@link Page} from a parsed HTML document.
     *
     * <p>Sets the last-modified date, document, relative path and "top"
     * prefix from the input file, then fills in head, body and title from
     * the document. The title is taken from {@code /html/head/title}; when
     * that element is missing, the {@code content} of the
     * {@code <meta name="title">} element is used instead.</p>
     *
     * @param input the file the document was parsed from
     * @param doc the parsed HTML document
     * @return the populated page
     * @throws XPathException if an XPath expression cannot be evaluated
     */
    protected Page build(FileObject input, Document doc) throws XPathException
    {
        Page page = new Page();

        page.setLastModified(new Date(input.lastModified()));
        page.setDocument(doc);
        page.setPath(input.getRelativePath());
        page.setTop(path2top(page.getPath()));
        Object o = getXpath().evaluate("/html/head", doc, XPathConstants.NODE);

        if (o != null)
        {
            page.setHead((Element) o);
        }

        o = getXpath().evaluate("/html/body", doc, XPathConstants.NODE);
        if (o != null)
        {
            page.setBody((Element) o);
        }

        o = getXpath().evaluate("/html/head/title", doc, XPathConstants.NODE);
        if (o != null)
        {
            page.setTitle(((Element) o).getTextContent());
        }
        else
        {
            page.setTitle(getMeta(doc, "title"));
        }

        return page;
    }

    /**
     * Computes the relative prefix that leads from a page back to the top
     * directory: one {@code "../"} for every path separator in {@code path}
     * found at index 1 or later.
     *
     * <p>For example {@code "/a/b/c.html"} yields {@code "../../"} and
     * {@code "index.html"} yields the empty string.</p>
     *
     * @param path the page path, using {@code '/'} as separator
     * @return the concatenated {@code "../"} segments, possibly empty
     */
    protected String path2top(String path)
    {
        StringBuilder buff = new StringBuilder();

        // The search starts at index 1 so that a leading separator is not counted.
        for (int idx = path.indexOf(PATH_SEPARATOR, 1); idx >= 0; idx = path.indexOf(PATH_SEPARATOR, idx + 1))
        {
            buff.append(PARENT_DIR);
            buff.append(PATH_SEPARATOR);
        }

        return buff.toString();
    }

    /**
     * Looks up the {@code content} attribute of the {@code <meta>} element
     * in the document head whose {@code name} attribute equals {@code name}.
     *
     * @param doc the parsed HTML document
     * @param name the meta name to look for; it is concatenated into the
     *        XPath expression inside single quotes, so it must not itself
     *        contain a single quote
     * @return the attribute value, or {@code null} when no such meta element
     *         exists
     * @throws XPathException if the XPath expression cannot be evaluated
     */
    private String getMeta(Document doc, String name) throws XPathException
    {
        Object o = getXpath().evaluate("/html/head/meta[@name='" + name + "']", doc, XPathConstants.NODE);

        return (o == null) ? null : ((Element) o).getAttribute("content");
    }

    /**
     * Resets the shared HTML parser and re-applies its configuration.
     *
     * <p>Despite the name, no new parser is created: the instance returned
     * is the one held by this component.</p>
     *
     * @return the reset and configured shared parser
     * @throws TinkException if a feature or property is rejected by the parser
     */
    private DOMParser newDOMParser() throws TinkException
    {
        getParser().reset();
        try
        {
            getParser().setFeature("http://xml.org/sax/features/namespaces", false);
            getParser().setFeature("http://cyberneko.org/html/features/scanner/fix-mswindows-refs", false);
            getParser().setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            getParser().setProperty("http://apache.org/xml/properties/dom/document-class-name", "org.apache.xerces.dom.DocumentImpl");
        }
        catch (SAXException x)
        {
            throw new TinkException(DOM_CONFIG_ERROR, x);
        }

        return getParser();
    }

    /**
     * Closes the given resource, ignoring a {@code null} argument and any
     * {@link IOException} raised by the close itself.
     *
     * @param resource the resource to close, may be {@code null}
     */
    private static void closeQuietly(Closeable resource)
    {
        if (resource == null)
        {
            return;
        }

        try
        {
            resource.close();
        }
        catch (IOException ignored)
        {
            // Deliberately ignored: a failure while closing must not hide the
            // outcome (success or failure) of the parse that used the resource.
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy