// info.monitorenter.cpdetector.io.ParsingDetector (Maven / Gradle / Ivy)

/*
 *  ParsingDetector.java, a facade to an ANTLR grammar based
 *  parser / lexer that searches for the "charset" attribute of an
 *  HTML page.
 *  Copyright (C) 2004 Achim Westermann, created on 20.07.2004, 10:35:46
 *
 * ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 * 
 * The contents of this collection are subject to the Mozilla Public License Version 
 * 1.1 (the "License"); you may not use this file except in compliance with 
 * the License. You may obtain a copy of the License at 
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 * 
 * The Original Code is the cpDetector code in [sub] packages info.monitorenter and 
 * cpdetector. 
 * 
 * The Initial Developer of the Original Code is
 * Achim Westermann .
 * 
 * Portions created by the Initial Developer are Copyright (c) 2007 
 * the Initial Developer. All Rights Reserved.
 * 
 * Contributor(s):
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 * 
 * ***** END LICENSE BLOCK ***** * 
 *  
 * If you modify or optimize the code in a useful way please let me know.
 * [email protected]
 */
package info.monitorenter.cpdetector.io;

import info.monitorenter.cpdetector.io.parser.EncodingLexer;
import info.monitorenter.cpdetector.io.parser.EncodingParser;
import info.monitorenter.io.LimitedInputStream;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;

import antlr.ANTLRException;

/**
 * <p>
 * A Fa&ccedil;ade that internally uses an ANTLR-based parser / lexer.
 * </p>
 * <p>
 * The underlying lexer is more a filter: It does not verify lexical correctness
 * by the means of matching a defined order of tokens, but just filters out
 * certain tokens. By now the following tokens are filtered:
 * </p>
 * <table border="1">
 * <tr>
 * <th>Token Name</th>
 * <th>Match</th>
 * <th>Lang.</th>
 * <th>Specification</th>
 * </tr>
 * <tr>
 * <td>META_CONTENT_TYPE</td>
 * <td>{@code "meta" "http-equiv" "=" '"Content-Type"' "content" "=" '"' IDENTIFIER "charset" "=" <EncName> '"'>}</td>
 * <td>HTML</td>
 * <td>W3C HTML 4.01 Specification, Chapter 5.2.2</td>
 * </tr>
 * <tr>
 * <td>XML_ENCODING_DECL</td>
 * <td>{@code "<?xml" VersionInfo "encoding" "=" <EncName>}</td>
 * <td>XML</td>
 * <td>Extensible Markup Language (XML) 1.0 (Third Edition), Chapter 2.8</td>
 * </tr>
 * </table>
 *
 * @author Achim Westermann
 */
public class ParsingDetector
    extends AbstractCodepageDetector {

  /** Generated serialVersionUID. */
  private static final long serialVersionUID = 3618977875919778866L;

  /** When {@code true}, progress and failure messages are printed to {@code System.out}. */
  private boolean m_verbose = false;

  /**
   * Creates a detector with verbose output switched off.
   */
  public ParsingDetector() {
    this(false);
  }

  /**
   * Creates a detector.
   *
   * @param verbose
   *          if {@code true}, the detector prints what it is doing and why it
   *          failed to {@code System.out}.
   */
  public ParsingDetector(boolean verbose) {
    super();
    this.m_verbose = verbose;
  }

  /**
   * Runs the encoding parser over the given stream, decoded as US-ASCII, and
   * maps the encoding name it reports to a {@link Charset}.
   * <p>
   * The stream is wrapped in a {@code LimitedInputStream} built from
   * {@code in} and {@code length} before it is handed to the lexer
   * (presumably this caps the number of bytes read - confirm against
   * {@code LimitedInputStream}).
   * </p>
   * <p>
   * Exceptions raised while lexing, parsing or resolving the charset are
   * caught inside this method and turned into a return value instead of
   * being propagated.
   * </p>
   *
   * @param in
   *          the stream containing the document to inspect.
   * @param length
   *          the length argument passed on to the {@code LimitedInputStream}.
   * @return
   *         <ul>
   *         <li>the result of {@code Charset.forName} for the name the parser
   *         returned, if that succeeds;</li>
   *         <li>{@code UnsupportedCharset.forName(name)} if a name was found
   *         but could not be resolved to a charset;</li>
   *         <li>{@code UnknownCharset.getInstance()} if the parser returned no
   *         name, or if a non-ANTLR exception occurred before a name was
   *         obtained;</li>
   *         <li>{@code null} if the parser or lexer failed with an
   *         {@code ANTLRException}.</li>
   *         </ul>
   * @throws IOException
   *           declared by the signature; failures inside the parsing section
   *           are caught here.
   */
  public Charset detectCodepage(final InputStream in, final int length) throws IOException {
    Charset detected = null;
    String encodingName = null;
    final InputStream bounded = new LimitedInputStream(in, length);

    if (this.m_verbose) {
      System.out
          .println("  parsing for html-charset/xml-encoding attribute with codepage: US-ASCII");
    }

    try {
      final EncodingParser encodingParser = new EncodingParser(
          new EncodingLexer(new InputStreamReader(bounded, "US-ASCII")));
      encodingName = encodingParser.htmlDocument();

      if (encodingName == null) {
        // The document carried no charset / encoding declaration.
        detected = UnknownCharset.getInstance();
      } else {
        // TODO: prepare document with illegal value, then test: Decide to catch
        // exception and return
        // UnsupportedCharset.
        try {
          detected = Charset.forName(encodingName);
        } catch (UnsupportedCharsetException unsupported) {
          // A name was declared, but this JVM has no charset for it.
          detected = UnsupportedCharset.forName(encodingName);
        }
      }
    } catch (ANTLRException parseFailure) {
      // Lexer / parser failure: nothing is assigned, so null is returned.
      if (this.m_verbose) {
        System.out.println("  ANTLR parser exception: " + parseFailure.getMessage());
      }
    } catch (Exception other) {
      if (this.m_verbose) {
        System.out.println("  Decoding Exception: " + other.getMessage()
            + " (unsupported java charset).");
      }
      // Fall back depending on whether a name had already been parsed.
      if (detected == null) {
        detected = (encodingName != null)
            ? UnsupportedCharset.forName(encodingName)
            : UnknownCharset.getInstance();
      }
    }
    return detected;
  }

}
/*
 * Token Name | Match | Lang. | Specification
 * META_CONTENT_TYPE | `"meta" "http-equiv" "=" '"Content-Type"' "content" "=" '"' IDENTIFIER "charset" "=" <EncName> '"'>` | HTML | W3C HTML 4.01 Specification Chapter 5.2.2
 * XML_ENCODING_DECL | `"<?xml" VersionInfo "encoding" "=" <EncName>` | XML | Extensible Markup Language (XML) 1.0 (Third Edition) Chapter 2.8
 */