org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-benchmark Show documentation
Show all versions of lucene-benchmark Show documentation
Apache Lucene (module: benchmark)
package org.apache.lucene.benchmark.byTask.feeds;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* Simple HTML Parser extracting title, meta tags, and body text
* that is based on NekoHTML.
*/
public class DemoHTMLParser implements HTMLParser {

  /**
   * The actual parser to read HTML documents.
   * <p>
   * Parsing happens entirely inside the constructor; afterwards the extracted
   * data is available through the public final fields {@link #metaTags},
   * {@link #title} and {@link #body}.
   */
  public static final class Parser {

    /**
     * Meta tags collected from {@code meta} elements inside {@code head}.
     * The key is the lower-cased {@code name} attribute (or {@code http-equiv}
     * when {@code name} is absent); the value is the {@code content} attribute.
     */
    public final Properties metaTags = new Properties();

    /** Trimmed text of the {@code title} element(s) found inside {@code head}. */
    public final String title;

    /** Text collected inside {@code body}, excluding suppressed elements. */
    public final String body;

    /**
     * Parses the HTML document supplied by the given reader.
     *
     * @param reader source of the HTML characters
     * @throws IOException if reading the document fails
     * @throws SAXException if the parser cannot be configured, or the document
     *         cannot be parsed (for example because it uses framesets)
     */
    public Parser(Reader reader) throws IOException, SAXException {
      this(new InputSource(reader));
    }

    /**
     * Parses the HTML document supplied by the given SAX input source.
     *
     * @param source source of the HTML document
     * @throws IOException if reading the document fails
     * @throws SAXException if the parser cannot be configured, or the document
     *         cannot be parsed (for example because it uses framesets)
     */
    public Parser(InputSource source) throws IOException, SAXException {
      final SAXParser parser = newConfiguredParser();

      // Named differently from the fields on purpose: the handler below must
      // write to these buffers, not to the (still unassigned) final fields.
      final StringBuilder titleText = new StringBuilder();
      final StringBuilder bodyText = new StringBuilder();

      final DefaultHandler handler = new DefaultHandler() {
        // Nesting counters for the elements we track; "suppressed" counts open
        // elements whose character content must not reach the body text.
        private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;

        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
          if (inHEAD > 0) {
            if (equalsIgnoreTurkish("title", localName)) {
              inTITLE++;
            } else if (equalsIgnoreTurkish("meta", localName)) {
              // prefer the "name" attribute, fall back to "http-equiv"
              String name = atts.getValue("name");
              if (name == null) {
                name = atts.getValue("http-equiv");
              }
              final String val = atts.getValue("content");
              if (name != null && val != null) {
                metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
              }
            }
          } else if (inBODY > 0) {
            if (SUPPRESS_ELEMENTS.contains(localName)) {
              suppressed++;
            } else if (equalsIgnoreTurkish("img", localName)) {
              // the original javacc-based parser preserved the "alt"
              // attribute of img elements as body text in [] brackets:
              final String alt = atts.getValue("alt");
              if (alt != null) {
                bodyText.append('[').append(alt).append(']');
              }
            }
          } else if (equalsIgnoreTurkish("body", localName)) {
            inBODY++;
          } else if (equalsIgnoreTurkish("head", localName)) {
            inHEAD++;
          } else if (equalsIgnoreTurkish("frameset", localName)) {
            throw new SAXException("This parser does not support HTML framesets.");
          }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
          if (inBODY > 0) {
            if (equalsIgnoreTurkish("body", localName)) {
              inBODY--;
            } else if (ENDLINE_ELEMENTS.contains(localName)) {
              // block-level element closed: terminate the current line
              bodyText.append('\n');
            } else if (SUPPRESS_ELEMENTS.contains(localName)) {
              suppressed--;
            }
          } else if (inHEAD > 0) {
            if (equalsIgnoreTurkish("head", localName)) {
              inHEAD--;
            } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
              inTITLE--;
            }
          }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
          if (inBODY > 0 && suppressed == 0) {
            bodyText.append(ch, start, length);
          } else if (inTITLE > 0) {
            titleText.append(ch, start, length);
          }
        }

        @Override
        public InputSource resolveEntity(String publicId, String systemId) {
          // disable network access caused by DTDs
          return new InputSource(new StringReader(""));
        }
      };

      parser.setContentHandler(handler);
      parser.setErrorHandler(handler);
      parser.parse(source);

      // the javacc-based parser trimmed title (which should be done for HTML in all cases):
      this.title = titleText.toString().trim();
      // assign body text
      this.body = bodyText.toString();
    }

    /**
     * Creates a NekoHTML SAX parser with the features and properties this
     * class relies on: namespaces and tag balancing enabled, error reporting
     * disabled, element and attribute names reported in lower case.
     */
    private static SAXParser newConfiguredParser() throws SAXException {
      final SAXParser parser = new SAXParser();
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
      return parser;
    }

    // TODO: remove the Turkish workaround once this is fixed in NekoHTML:
    // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
    // NOTE(review): presumably needed because names are lower-cased with the
    // default locale, which yields a dotless i under a Turkish locale - confirm
    // against the tracker entry.

    // BEGIN: workaround

    /**
     * Returns a copy of {@code s} in which every {@code 'i'} is replaced by
     * the dotless {@code 'ı'}.
     */
    static final String convertTurkish(String s) {
      return s.replace('i', 'ı');
    }

    /**
     * Compares two strings character by character, treating the dotless
     * {@code 'ı'} as equal to {@code 'i'}. The comparison is otherwise
     * case-sensitive.
     *
     * @return {@code true} if both strings have the same length and match at
     *         every position under that rule
     */
    static final boolean equalsIgnoreTurkish(String s1, String s2) {
      final int len1 = s1.length(), len2 = s2.length();
      if (len1 != len2) {
        return false;
      }
      for (int i = 0; i < len1; i++) {
        char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
        if (ch1 == 'ı') {
          ch1 = 'i';
        }
        if (ch2 == 'ı') {
          ch2 = 'i';
        }
        if (ch1 != ch2) {
          return false;
        }
      }
      return true;
    }

    // END: workaround

    /**
     * Builds an unmodifiable set containing each given name together with its
     * {@link #convertTurkish(String)} variant, so that {@code contains} also
     * matches names reported with a dotless i.
     */
    static final Set<String> createElementNameSet(String... names) {
      final HashSet<String> set = new HashSet<String>();
      for (final String name : names) {
        set.add(name);
        set.add(convertTurkish(name));
      }
      return Collections.unmodifiableSet(set);
    }

    /** HTML elements that cause a line break (they are block-elements) */
    static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
      "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
      "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
      "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
    );

    /** HTML elements with contents that are ignored */
    static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
      "style", "script"
    );
  }

  /**
   * Parses the HTML supplied by {@code reader} into {@code docData}.
   * Delegates to
   * {@link #parse(DocData, String, Date, InputSource, TrecContentSource)} and
   * wraps any {@link SAXException} in an {@link IOException}, keeping the
   * original exception as the cause.
   */
  @Override
  public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
    try {
      return parse(docData, name, date, new InputSource(reader), trecSrc);
    } catch (SAXException saxe) {
      throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
    }
  }

  /**
   * Parses the HTML supplied by {@code source} and stores the result in
   * {@code docData}, which is cleared first and then returned.
   * <p>
   * If the document has a {@code date} meta tag and
   * {@code trecSrc.parseDate} returns a non-null value for it, that value
   * replaces the {@code date} argument.
   *
   * @param docData the instance to clear and fill with name, body, title, props and date
   * @param name document name to set
   * @param date date to use unless the document's meta tags supply a parseable one
   * @param source source of the HTML document
   * @param trecSrc used to parse the value of the {@code date} meta tag
   * @return the given {@code docData}
   * @throws IOException if reading the document fails
   * @throws SAXException if the document cannot be parsed
   */
  public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
    final Parser p = new Parser(source);

    // properties
    final Properties props = p.metaTags;
    String dateStr = props.getProperty("date");
    if (dateStr != null) {
      final Date newDate = trecSrc.parseDate(dateStr);
      if (newDate != null) {
        date = newDate;
      }
    }

    docData.clear();
    docData.setName(name);
    docData.setBody(p.body);
    docData.setTitle(p.title);
    docData.setProps(props);
    docData.setDate(date);
    return docData;
  }
}