All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Date;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Simple HTML parser, based on NekoHTML, that extracts the title, the meta tags, and the body text
 * of a document.
 */
public class DemoHTMLParser implements HTMLParser {

  /**
   * The actual parser to read HTML documents. All parsing happens inside the constructor; the
   * results are exposed afterwards through the public fields {@link #metaTags}, {@link #title} and
   * {@link #body}.
   */
  public static final class Parser {

    /**
     * Meta tags found while inside the document head. The key is the lower-cased {@code name}
     * attribute (or {@code http-equiv} when {@code name} is absent); the value is the {@code
     * content} attribute. Tags missing either part are not recorded.
     */
    public final Properties metaTags = new Properties();

    /** The trimmed title text and the collected body text of the parsed document. */
    public final String title, body;

    /**
     * Parses the HTML document supplied by the given reader.
     *
     * @param reader character stream of the HTML document
     * @throws IOException propagated from the underlying parser
     * @throws SAXException propagated from the underlying parser, or raised when a frameset is
     *     encountered
     */
    public Parser(Reader reader) throws IOException, SAXException {
      this(new InputSource(reader));
    }

    /**
     * Parses the HTML document supplied by the given input source.
     *
     * @param source SAX input source of the HTML document
     * @throws IOException propagated from the underlying parser
     * @throws SAXException propagated from the underlying parser, or raised when a {@code
     *     frameset} element starts outside of head and body
     */
    public Parser(InputSource source) throws IOException, SAXException {
      final SAXParser parser = new SAXParser();
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
      // Request lower-case element and attribute names; the handler below compares against
      // lower-case literals only.
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

      final StringBuilder title = new StringBuilder(), body = new StringBuilder();
      final DefaultHandler handler =
          new DefaultHandler() {
            // Counters tracking which section of the document the parser is currently in.
            // "suppressed" counts open elements whose text must not reach the body.
            private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;

            @Override
            public void startElement(
                String namespaceURI, String localName, String qName, Attributes atts)
                throws SAXException {
              if (inHEAD > 0) {
                if ("title".equals(localName)) {
                  inTITLE++;
                } else if ("meta".equals(localName)) {
                  // prefer the "name" attribute, fall back to "http-equiv"
                  String name = atts.getValue("name");
                  if (name == null) {
                    name = atts.getValue("http-equiv");
                  }
                  final String val = atts.getValue("content");
                  if (name != null && val != null) {
                    metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
                  }
                }
              } else if (inBODY > 0) {
                if (SUPPRESS_ELEMENTS.contains(localName)) {
                  suppressed++;
                } else if ("img".equals(localName)) {
                  // the original javacc-based parser preserved the <img alt="..."/>
                  // attribute as body text in [] parenthesis:
                  final String alt = atts.getValue("alt");
                  if (alt != null) {
                    body.append('[').append(alt).append(']');
                  }
                }
              } else if ("body".equals(localName)) {
                inBODY++;
              } else if ("head".equals(localName)) {
                inHEAD++;
              } else if ("frameset".equals(localName)) {
                throw new SAXException("This parser does not support HTML framesets.");
              }
            }

            @Override
            public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
              if (inBODY > 0) {
                if ("body".equals(localName)) {
                  inBODY--;
                } else if (ENDLINE_ELEMENTS.contains(localName)) {
                  // closing a block-level element ends the current line of body text
                  body.append('\n');
                } else if (SUPPRESS_ELEMENTS.contains(localName)) {
                  suppressed--;
                }
              } else if (inHEAD > 0) {
                if ("head".equals(localName)) {
                  inHEAD--;
                } else if (inTITLE > 0 && "title".equals(localName)) {
                  inTITLE--;
                }
              }
            }

            @Override
            public void characters(char[] ch, int start, int length) throws SAXException {
              // body text wins over title text; text inside suppressed elements is dropped
              if (inBODY > 0 && suppressed == 0) {
                body.append(ch, start, length);
              } else if (inTITLE > 0) {
                title.append(ch, start, length);
              }
            }

            @Override
            public InputSource resolveEntity(String publicId, String systemId) {
              // disable network access caused by DTDs
              // NOTE(review): the handler is only registered as content and error handler below,
              // not via setEntityResolver; confirm whether this override is ever invoked.
              return new InputSource(new StringReader(""));
            }
          };

      parser.setContentHandler(handler);
      parser.setErrorHandler(handler);
      parser.parse(source);

      // the javacc-based parser trimmed title (which should be done for HTML in all cases):
      this.title = title.toString().trim();

      // assign body text
      this.body = body.toString();
    }

    /** HTML elements that cause a line break (they are block-elements) */
    static final Set<String> ENDLINE_ELEMENTS =
        Set.of(
            "p",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "div",
            "ul",
            "ol",
            "dl",
            "pre",
            "hr",
            "blockquote",
            "address",
            "fieldset",
            "table",
            "form",
            "noscript",
            "li",
            "dt",
            "dd",
            "noframes",
            "br",
            "tr",
            "select",
            "option");

    /** HTML elements with contents that are ignored */
    static final Set<String> SUPPRESS_ELEMENTS = Set.of("style", "script");
  }

  /**
   * Parses the HTML read from {@code reader} into {@code docData}. Delegates to {@link
   * #parse(DocData, String, Date, InputSource, TrecContentSource)} and rethrows any {@link
   * SAXException} wrapped in an {@link IOException}, keeping the original as the cause.
   *
   * @param docData the instance to fill; it is cleared before being populated
   * @param name name to store on the document
   * @param date date to store unless the document carries a parsable "date" meta tag
   * @param reader character stream of the HTML document
   * @param trecSrc used to parse the "date" meta tag, when one is present
   * @return the populated {@code docData} instance
   * @throws IOException on read failure or when the HTML could not be parsed
   */
  @Override
  public DocData parse(
      DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc)
      throws IOException {
    try {
      return parse(docData, name, date, new InputSource(reader), trecSrc);
    } catch (SAXException saxe) {
      throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
    }
  }

  /**
   * Parses the HTML supplied by {@code source} into {@code docData}.
   *
   * <p>If the document has a "date" meta tag and {@code trecSrc.parseDate} returns a non-null
   * value for it, that value replaces the {@code date} argument. {@code trecSrc} is only
   * dereferenced when such a meta tag exists.
   *
   * @param docData the instance to fill; it is cleared before being populated
   * @param name name to store on the document
   * @param date fallback date for the document
   * @param source SAX input source of the HTML document
   * @param trecSrc used to parse the "date" meta tag, when one is present
   * @return the populated {@code docData} instance
   * @throws IOException propagated from the underlying parser
   * @throws SAXException propagated from the underlying parser
   */
  public DocData parse(
      DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc)
      throws IOException, SAXException {
    final Parser p = new Parser(source);

    // properties
    final Properties props = p.metaTags;
    String dateStr = props.getProperty("date");
    if (dateStr != null) {
      final Date newDate = trecSrc.parseDate(dateStr);
      if (newDate != null) {
        date = newDate;
      }
    }

    docData.clear();
    docData.setName(name);
    docData.setBody(p.body);
    docData.setTitle(p.title);
    docData.setProps(props);
    docData.setDate(date);
    return docData;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy