
// net.sourceforge.tink.model.impl.TinkParser Maven / Gradle / Ivy
/**
* Copyright 2008,2009 Ivan SZKIBA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.sourceforge.tink.model.impl;
import net.sourceforge.tink.model.FileObject;
import net.sourceforge.tink.model.Page;
import net.sourceforge.tink.model.TinkContext;
import net.sourceforge.tink.model.TinkException;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.Reader;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathFactory;
/**
 * Parses input files into DOM documents and {@link Page} objects.
 *
 * <p>Data files are parsed with a JAXP {@link DocumentBuilder}; pages are
 * parsed with a CyberNeko HTML {@link DOMParser} and then inspected with
 * XPath to pick out the head, body and title.
 *
 * <p>A single builder, parser and XPath instance are created in
 * {@link #init(TinkContext)} and reused by every call.
 */
public class TinkParser extends AbstractTinkComponent
{
    private static final String PARSE_ERROR_MESSAGE = "Error parsing file: ";
    private static final String DOM_CONFIG_ERROR = "DOM configuration problem";
    private static final String PARENT_DIR = "..";
    private static final char PATH_SEPARATOR = '/';

    private DocumentBuilder builder;
    private DOMParser parser;
    private XPath xpath;

    /**
     * Initializes the component and creates the XPath evaluator, the HTML
     * parser and the XML document builder used by the parse methods.
     *
     * @param context the context handed on to the superclass initializer
     * @throws TinkException if the document builder cannot be configured
     */
    @Override public void init(TinkContext context) throws TinkException
    {
        super.init(context);
        setXpath(XPathFactory.newInstance().newXPath());
        setParser(new DOMParser());
        try
        {
            setBuilder(DocumentBuilderFactory.newInstance().newDocumentBuilder());
        }
        catch (ParserConfigurationException x)
        {
            throw new TinkException(DOM_CONFIG_ERROR, x);
        }
    }

    /**
     * Parses the given file with the XML document builder.
     *
     * @param input the file to parse
     * @return the parsed document
     * @throws TinkException wrapping any failure, with the file's absolute
     *         path in the message
     */
    public Document parseData(FileObject input) throws TinkException
    {
        try
        {
            return getBuilder().parse(input);
        }
        catch (Exception x)
        {
            throw new TinkException(PARSE_ERROR_MESSAGE + input.getAbsolutePath(), x);
        }
    }

    /**
     * Parses the given file with the HTML parser and builds a {@link Page}
     * from the resulting document.
     *
     * <p>The reader obtained from the input is always closed before this
     * method returns, whether or not parsing succeeded.
     *
     * @param input the file to parse
     * @return the page built from the parsed document
     * @throws TinkException wrapping any failure, with the file's absolute
     *         path in the message
     */
    public Page parsePage(FileObject input) throws TinkException
    {
        try
        {
            DOMParser p = newDOMParser();
            // NOTE(review): assumes openReader() returns a java.io.Reader - confirm against FileObject.
            Reader reader = input.openReader();
            try
            {
                p.parse(new InputSource(reader));
            }
            finally
            {
                // Close the reader ourselves instead of relying on the parser's
                // end-of-parse cleanup, so it is released on failure paths too.
                closeQuietly(reader);
            }
            return build(input, p.getDocument());
        }
        catch (Exception x)
        {
            throw new TinkException(PARSE_ERROR_MESSAGE + input.getAbsolutePath(), x);
        }
    }

    /** @return the XML document builder created by {@link #init(TinkContext)} */
    protected DocumentBuilder getBuilder()
    {
        return builder;
    }

    /** @param value the XML document builder to use for {@link #parseData(FileObject)} */
    protected void setBuilder(DocumentBuilder value)
    {
        this.builder = value;
    }

    /** @return the shared HTML parser */
    protected DOMParser getParser()
    {
        return parser;
    }

    /** @param value the HTML parser to use for {@link #parsePage(FileObject)} */
    protected void setParser(DOMParser value)
    {
        this.parser = value;
    }

    /** @return the shared XPath evaluator */
    protected XPath getXpath()
    {
        return xpath;
    }

    /** @param value the XPath evaluator to use when building pages */
    protected void setXpath(XPath value)
    {
        this.xpath = value;
    }

    /**
     * Builds a page from a parsed document.
     *
     * <p>Copies the last-modified time and relative path from the input,
     * derives the path back to the top from that relative path, and fills in
     * head and body when the corresponding elements exist. The title is taken
     * from {@code /html/head/title} when present, otherwise from the
     * {@code content} attribute of the {@code meta} element named
     * {@code title} (which may yield {@code null}).
     *
     * @param input the file the document was parsed from
     * @param doc the parsed document
     * @return the populated page
     * @throws XPathException if an XPath expression cannot be evaluated
     */
    protected Page build(FileObject input, Document doc) throws XPathException
    {
        Page page = new Page();
        page.setLastModified(new Date(input.lastModified()));
        page.setDocument(doc);
        page.setPath(input.getRelativePath());
        page.setTop(path2top(page.getPath()));

        Object o = getXpath().evaluate("/html/head", doc, XPathConstants.NODE);
        if (o != null)
        {
            page.setHead((Element) o);
        }

        o = getXpath().evaluate("/html/body", doc, XPathConstants.NODE);
        if (o != null)
        {
            page.setBody((Element) o);
        }

        o = getXpath().evaluate("/html/head/title", doc, XPathConstants.NODE);
        if (o != null)
        {
            page.setTitle(((Element) o).getTextContent());
        }
        else
        {
            // No title element: fall back to the meta element named "title".
            page.setTitle(getMeta(doc, "title"));
        }
        return page;
    }

    /**
     * Produces one {@code "../"} segment for every path separator found in
     * {@code path} at index 1 or later; a separator at index 0 is not counted.
     * For example {@code "/a/b/c.html"} yields {@code "../../"}.
     *
     * @param path the path to inspect
     * @return the concatenated parent-directory segments, possibly empty
     */
    protected String path2top(String path)
    {
        StringBuilder buff = new StringBuilder();
        // The search starts at index 1 so that a leading separator is skipped.
        for (int idx = path.indexOf(PATH_SEPARATOR, 1); idx >= 0; idx = path.indexOf(PATH_SEPARATOR, idx + 1))
        {
            buff.append(PARENT_DIR);
            buff.append(PATH_SEPARATOR);
        }
        return buff.toString();
    }

    /**
     * Looks up the {@code content} attribute of the {@code meta} element in
     * {@code /html/head} whose {@code name} attribute equals {@code name}.
     *
     * @param doc the document to search
     * @param name the meta name; it is inserted into the XPath expression
     *        unescaped, so it must not contain a single quote
     * @return the attribute value, or {@code null} when no such element exists
     * @throws XPathException if the expression cannot be evaluated
     */
    private String getMeta(Document doc, String name) throws XPathException
    {
        Object o = getXpath().evaluate("/html/head/meta[@name='" + name + "']", doc, XPathConstants.NODE);
        return (o == null) ? null : ((Element) o).getAttribute("content");
    }

    /**
     * Resets the shared HTML parser, reapplies its features and properties,
     * and returns it. Despite the name, no new parser instance is created.
     *
     * @return the shared parser, reset and configured
     * @throws TinkException if a feature or property is rejected by the parser
     */
    private DOMParser newDOMParser() throws TinkException
    {
        getParser().reset();
        try
        {
            getParser().setFeature("http://xml.org/sax/features/namespaces", false);
            getParser().setFeature("http://cyberneko.org/html/features/scanner/fix-mswindows-refs", false);
            getParser().setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            getParser().setProperty("http://apache.org/xml/properties/dom/document-class-name", "org.apache.xerces.dom.DocumentImpl");
        }
        catch (SAXException x)
        {
            throw new TinkException(DOM_CONFIG_ERROR, x);
        }
        return getParser();
    }

    /**
     * Closes the reader, ignoring a {@code null} argument and any failure
     * raised by {@code close()}.
     *
     * @param reader the reader to close, may be {@code null}
     */
    private static void closeQuietly(Reader reader)
    {
        if (reader == null)
        {
            return;
        }
        try
        {
            reader.close();
        }
        catch (IOException ignored)
        {
            // Deliberately ignored: the input has already been consumed (or
            // parsing has already failed), and a close failure must not mask
            // the parse outcome.
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy