All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.cobraparser.html.parser.DocumentBuilderImpl Maven / Gradle / Ivy

There is a newer version: 1.0.2
Show newest version
/*
    GNU LESSER GENERAL PUBLIC LICENSE
    Copyright (C) 2006 The Lobo Project

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

    Contact info: [email protected]
 */
/*
 * Created on Oct 15, 2005
 */
package org.cobraparser.html.parser;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.logging.Logger;

import javax.xml.parsers.DocumentBuilder;

import org.cobraparser.html.HtmlRendererContext;
import org.cobraparser.html.domimpl.DOMImplementationImpl;
import org.cobraparser.html.domimpl.HTMLDocumentImpl;
import org.cobraparser.html.io.WritableLineReader;
import org.cobraparser.ua.UserAgentContext;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * The DocumentBuilderImpl class is an HTML DOM parser that
 * implements the standard W3C DocumentBuilder interface.
 *
 * @author J. H. S.
 */
public class DocumentBuilderImpl extends DocumentBuilder {
  private static final Logger logger = Logger.getLogger(DocumentBuilderImpl.class.getName());
  private EntityResolver resolver;
  private ErrorHandler errorHandler;
  private final UserAgentContext bcontext;
  private final HtmlRendererContext rcontext;

  /**
   * Constructs a DocumentBuilderImpl. This constructor should be
   * used when only the parsing functionality (without rendering) is required.
   *
   * @param context
   *          An instance of {@link org.cobraparser.html.UserAgentContext},
   *          which may be an instance of
   *          {@link org.cobraparser.html.test.SimpleUserAgentContext}.
   */
  public DocumentBuilderImpl(final UserAgentContext context) {
    this.rcontext = null;
    this.bcontext = context;
  }

  /**
   * Constructs a DocumentBuilderImpl. This constructor should be
   * used when rendering is expected.
   *
   * @param ucontext
   *          An instance of {@link org.cobraparser.html.UserAgentContext},
   *          which may be an instance of
   *          {@link org.cobraparser.html.test.SimpleUserAgentContext}.
   * @param rcontext
   *          An instance of {@link org.cobraparser.html.HtmlRendererContext},
   *          which may be an instance of
   *          {@link org.cobraparser.html.test.SimpleHtmlRendererContext}.
   */
  public DocumentBuilderImpl(final UserAgentContext ucontext, final HtmlRendererContext rcontext) {
    this.rcontext = rcontext;
    this.bcontext = ucontext;
  }

  /**
   * Constructs a DocumentBuilderImpl. This constructor should be
   * used when rendering is expected.
   *
   * @param rcontext
   *          An instance of {@link org.cobraparser.html.HtmlRendererContext},
   *          which may be an instance of
   *          {@link org.cobraparser.html.test.SimpleHtmlRendererContext}.
   */
  public DocumentBuilderImpl(final HtmlRendererContext rcontext) {
    this.rcontext = rcontext;
    this.bcontext = rcontext.getUserAgentContext();
  }

  /**
   * Parses an HTML document. Note that this method will read the entire input
   * source before returning a Document instance.
   *
   * @param is
   *          The input source, which may be an instance of
   *          {@link InputSourceImpl}.
   * @see #createDocument(InputSource)
   */
  @Override
  public Document parse(final InputSource is) throws SAXException, IOException {
    final HTMLDocumentImpl document = (HTMLDocumentImpl) this.createDocument(is, "");
    document.load();
    return document;
  }

  /**
   * Creates a document without parsing the input provided, so the document
   * object can be used for incremental rendering.
   *
   * @param is
   *          The input source, which may be an instance of
   *          {@link InputSourceImpl}. The input
   *          source must provide either an input stream or a reader.
   * @see HTMLDocumentImpl#load()
   */
  public Document createDocument(final InputSource is, final String contentType) throws SAXException, IOException {
    final String encoding = is.getEncoding();
    String charset = encoding;
    if (charset == null) {
      charset = "US-ASCII";
    }
    final String uri = is.getSystemId();
    if (uri == null) {
      logger.warning("parse(): InputSource has no SystemId (URI); document item URLs will not be resolvable.");
    }
    WritableLineReader wis;
    final Reader reader = is.getCharacterStream();
    if (reader != null) {
      wis = new WritableLineReader(reader);
    } else {
      final InputStream in = is.getByteStream();
      if (in != null) {
        wis = new WritableLineReader(new InputStreamReader(in, charset));
      } else if (uri != null) {
        throw new IllegalArgumentException("The input source didn't have a character stream, nor an inputstream!");
        /*
        // To comply with the InputSource documentation, we need
        // to do this:
        final java.net.URLConnection connection = new java.net.URL(uri).openConnection();
        in = connection.getInputStream();
        if (encoding == null) {
          charset = org.cobraparser.util.Urls.getCharset(connection);
        }
        wis = new WritableLineReader(new InputStreamReader(in, charset));
         */
      } else {
        throw new IllegalArgumentException("The InputSource must have either a reader, an input stream or a URI.");
      }
    }
    final HTMLDocumentImpl document = new HTMLDocumentImpl(this.bcontext, this.rcontext, wis, uri, contentType);
    return document;
  }

  @Override
  public boolean isNamespaceAware() {
    return false;
  }

  @Override
  public boolean isValidating() {
    return false;
  }

  @Override
  public void setEntityResolver(final EntityResolver er) {
    this.resolver = er;
  }

  @Override
  public void setErrorHandler(final ErrorHandler eh) {
    this.errorHandler = eh;
  }

  @Override
  public Document newDocument() {
    return new HTMLDocumentImpl(this.bcontext);
  }

  private DOMImplementation domImplementation;

  @Override
  public DOMImplementation getDOMImplementation() {
    synchronized (this) {
      if (this.domImplementation == null) {
        this.domImplementation = new DOMImplementationImpl(this.bcontext);
      }
      return this.domImplementation;
    }
  }

  public ErrorHandler getErrorHandler() {
    return errorHandler;
  }

  public EntityResolver getResolver() {
    return resolver;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy