
// com.jaeksoft.searchlib.parser.htmlParser.HtmlCleanerParser (Maven / Gradle / Ivy)
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2012-2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.parser.htmlParser;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collection;
import java.util.Locale;
import javax.xml.parsers.ParserConfigurationException;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.Logging;
public class HtmlCleanerParser extends HtmlDocumentProvider {
private final HtmlCleaner cleaner;
private TagNode rootTagNode = null;
private String charsetCache = null;
public HtmlCleanerParser() {
super(HtmlParserEnum.HtmlCleanerParser);
cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(true);
}
@Override
protected HtmlNodeAbstract> getDocument(String charset,
InputStream inputStream) throws SAXException, IOException,
ParserConfigurationException {
rootTagNode = cleaner.clean(inputStream, charset);
charsetCache = null;
return getDomHtmlNode();
}
@Override
protected HtmlNodeAbstract> getDocument(String pageSource)
throws IOException, ParserConfigurationException {
rootTagNode = cleaner.clean(new StringReader(pageSource));
charsetCache = null;
return getDomHtmlNode();
}
private DomHtmlNode getDomHtmlNode() throws ParserConfigurationException {
Document document = new DomSerializer(cleaner.getProperties(), true)
.createDOM(rootTagNode);
String lang = rootTagNode.getAttributeByName("lang");
if (lang != null)
document.getDocumentElement().setAttribute("lang", lang);
return new DomHtmlNode(document);
}
public String findCharset() {
if (charsetCache != null)
return charsetCache;
String charsetCache = getMetaCharset();
if (charsetCache == null)
return null;
try {
Charset.forName(charsetCache);
return charsetCache;
} catch (UnsupportedCharsetException e1) {
try {
charsetCache = charsetCache.toUpperCase();
Charset.forName(charsetCache);
return charsetCache;
} catch (UnsupportedCharsetException e2) {
Logging.warn(e2);
charsetCache = null;
return null;
}
}
}
public void writeHtmlToFile(File htmlFile) throws IOException {
SimpleHtmlSerializer htmlSerializer = new SimpleHtmlSerializer(
cleaner.getProperties());
String charset = findCharset();
if (charset != null)
htmlSerializer.writeToFile(rootTagNode, htmlFile.getAbsolutePath(),
charset);
else
htmlSerializer.writeToFile(rootTagNode, htmlFile.getAbsolutePath());
}
public TagNode getTagNode() {
return rootTagNode;
}
final public int xpath(String xPathExpression,
Collection tagNodeCollection) throws XPatherException {
if (xPathExpression.startsWith("/html"))
xPathExpression = xPathExpression.substring(5);
Object[] objects = rootTagNode.evaluateXPath(xPathExpression);
if (objects == null)
return 0;
for (Object object : objects)
tagNodeCollection.add((TagNode) object);
return objects.length;
}
@Override
public boolean isXPathSupported() {
return true;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy