info.monitorenter.cpdetector.io.ByteOrderMarkDetector Maven / Gradle / Ivy

Go to download
/*
 * ByteOrderMarkDetector.java, . Copyright (C) 2005 Achim Westermann, [email protected]
 * 
 * ***** BEGIN LICENSE BLOCK ***** Version: MPL 1.1/GPL 2.0/LGPL 2.1
 * 
 * The contents of this collection are subject to the Mozilla Public License Version 1.1 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations under the License.
 * 
 * The Original Code is the cpDetector code in [sub] packages info.monitorenter and cpdetector.
 * 
 * The Initial Developer of the Original Code is Achim Westermann .
 * 
 * Portions created by the Initial Developer are Copyright (c) 2007 the Initial Developer. All Rights Reserved.
 * 
 * Contributor(s):
 * 
 * Alternatively, the contents of this file may be used under the terms of either the GNU General Public License Version 2 or later (the
 * "GPL"), or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), in which case the provisions of the GPL or the LGPL
 * are applicable instead of those above. If you wish to allow use of your version of this file only under the terms of either the GPL or
 * the LGPL, and not to allow others to use your version of this file under the terms of the MPL, indicate your decision by deleting the
 * provisions above and replace them with the notice and other provisions required by the GPL or the LGPL. If you do not delete the
 * provisions above, a recipient may use your version of this file under the terms of any one of the MPL, the GPL or the LGPL.
 * 
 * ***** END LICENSE BLOCK ***** *
 * 
 * If you modify or optimize the code in a useful way please let me know. [email protected]
 */
package info.monitorenter.cpdetector.io;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;

/**
 * 
 * This detector identifies byte order marks of the following codepages to give a 100 %
 * deterministic result in case of detection.
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * 
 * <pre>
 * 00 00 FE FF  UCS-4, big-endian machine (1234 order)
 * FF FE 00 00  UCS-4, little-endian machine (4321 order)
 * 00 00 FF FE  UCS-4, unusual octet order (2143)
 * FE FF 00 00  UCS-4, unusual octet order (3412)
 * FE FF ## ##  UTF-16, big-endian
 * FF FE ## ##  UTF-16, little-endian
 * EF BB BF     UTF-8
 * </pre>
 * 
 * 
 * Note that this detector is very fast as it reads at most 4 bytes to provide a result.
 * Nevertheless, it is pointless to add it to the configuration if only few of the documents to
 * detect are expected to be in one of the codepages listed above. If added to the
 * configuration of {@link info.monitorenter.cpdetector.io.CodepageDetectorProxy} it should be at
 * the front position to save the computations of the subsequent detection processes.
 * 

 * 

 * This implementation is based on:
 * W3C XML Specification 1.0 3rd Edition, F.1 Detection Without External Encoding Information.
 * 
 * This implementation does the same as {@link info.monitorenter.cpdetector.io.UnicodeDetector} but with a different
 * read strategy (read each byte separately) and a switch case tree (-> bytecode tableswitch). Would
 * be great to have a performance comparison. Maybe the read of 4 bytes in a row combined with the
 * switch could make this implementation the winner.
 * 
 * 
 * @author Achim Westermann 
 * @version $Revision: 1.3 $
 */
public class ByteOrderMarkDetector
    extends AbstractCodepageDetector implements ICodepageDetector {

  /**
   * Generated serialVersionUID.
   */
  private static final long serialVersionUID = 3618977875919778866L;

  /**
   * Inspects the leading bytes of the given stream for a byte order mark and maps it to a
   * charset.
   * <p>
   * Recognized marks:
   * <pre>
   * 00 00 FE FF  -> "UCS-4BE"
   * 00 00 FF FE  -> "UCS-4"   (unusual octet order 2143)
   * FE FF 00 00  -> "UCS-4"   (unusual octet order 3412)
   * FF FE 00 00  -> "UCS-4LE"
   * FE FF        -> "UTF-16BE"
   * FF FE        -> "UTF-16LE"
   * EF BB BF     -> "utf-8"
   * </pre>
   * At most 4 bytes are read. The stream is neither marked nor reset here, so the bytes that
   * were consumed are not pushed back. Reaching the end of the stream early is treated like any
   * other non-matching byte.
   *
   * @param in
   *          the stream to read the leading bytes from.
   * @param length
   *          not consulted by this implementation.
   * @return the charset named by the detected byte order mark (or the result of
   *         {@code UnsupportedCharset.forName} if the platform does not provide it), or
   *         {@code UnknownCharset.getInstance()} if no byte order mark was recognized.
   * @throws IOException
   *           if reading from the stream fails.
   * @see ICodepageDetector#detectCodepage(java.io.InputStream, int)
   */
  public Charset detectCodepage(final InputStream in, final int length) throws IOException {
    final Charset unknown = UnknownCharset.getInstance();
    switch (in.read()) {
      case 0x00: {
        // 0x 00 -> only 00 00 FE FF and 00 00 FF FE remain possible.
        if (in.read() != 0x00) {
          return unknown;
        }
        switch (in.read()) {
          case 0xFE: {
            // 0x 00 00 FE: the mark is only complete with a trailing FF.
            if (in.read() == 0xFF) {
              // UCS-4, big-endian machine (1234 order)
              return resolveCharset("UCS-4BE");
            }
            return unknown;
          }
          case 0xFF: {
            // 0x 00 00 FF: the mark is only complete with a trailing FE.
            if (in.read() == 0xFE) {
              // UCS-4, unusual octet order (2143)
              return resolveCharset("UCS-4");
            }
            return unknown;
          }
          default: {
            return unknown;
          }
        }
      }
      case 0xFE: {
        // 0x FE
        if (in.read() != 0xFF) {
          return unknown;
        }
        // 0x FE FF: UTF-16, big-endian unless followed by 00 00 (UCS-4, order 3412).
        return utf16UnlessUcs4(in, "UTF-16BE", "UCS-4");
      }
      case 0xFF: {
        // 0x FF
        if (in.read() != 0xFE) {
          return unknown;
        }
        // 0x FF FE: UTF-16, little-endian unless followed by 00 00 (UCS-4, order 4321).
        return utf16UnlessUcs4(in, "UTF-16LE", "UCS-4LE");
      }
      case 0xEF: {
        // 0x EF
        if (in.read() != 0xBB) {
          return unknown;
        }
        // 0x EF BB
        if (in.read() == 0xBF) {
          return resolveCharset("utf-8");
        }
        return unknown;
      }
      default: {
        return unknown;
      }
    }
  }

  /**
   * Decides between UTF-16 and UCS-4 after a two byte UTF-16 mark has already been consumed.
   * <p>
   * The fourth byte is only read when the third byte is 00.
   *
   * @param in
   *          the stream positioned directly behind the two byte mark.
   * @param utf16Name
   *          charset name to use when the next two bytes are not both 00.
   * @param ucs4Name
   *          charset name to use when the next two bytes are 00 00.
   * @return the charset resolved for the chosen name.
   * @throws IOException
   *           if reading from the stream fails.
   */
  private static Charset utf16UnlessUcs4(final InputStream in, final String utf16Name,
      final String ucs4Name) throws IOException {
    if (in.read() == 0x00 && in.read() == 0x00) {
      return resolveCharset(ucs4Name);
    }
    return resolveCharset(utf16Name);
  }

  /**
   * Looks up the charset with the given name, falling back to
   * {@code UnsupportedCharset.forName} when the platform does not support it.
   *
   * @param name
   *          the charset name to look up.
   * @return the platform charset or the fallback instance for the name.
   */
  private static Charset resolveCharset(final String name) {
    try {
      return Charset.forName(name);
    } catch (UnsupportedCharsetException uce) {
      // Detection succeeded even though this VM cannot decode the charset.
      return UnsupportedCharset.forName(name);
    }
  }

  /**
   * Opens the given URL and delegates to {@link #detectCodepage(java.io.InputStream, int)} with
   * a {@link BufferedInputStream} (default buffer size) around the URL's stream.
   * <p>
   * The stream is closed before this method returns, also when detection fails with an
   * exception.
   *
   * @param url
   *          the location of the document to inspect.
   * @return the result of {@link #detectCodepage(java.io.InputStream, int)} for the URL's
   *         content.
   * @throws IOException
   *           if opening, reading or closing the stream fails.
   * @see ICodepageDetector#detectCodepage(java.net.URL)
   */
  public Charset detectCodepage(URL url) throws IOException {
    final BufferedInputStream in = new BufferedInputStream(url.openStream());
    try {
      return this.detectCodepage(in, Integer.MAX_VALUE);
    } finally {
      // Release the connection even if reading the leading bytes threw.
      in.close();
    }
  }
}
00 00 FE FF	UCS-4, big-endian machine (1234 order)
FF FE 00 00	UCS-4,little-endian machine (4321 order)
00 00 FF FE	UCS-4, unusual octet order (2143)
FE FF 00 00	UCS-4, unusual octet order (3412)
FE FF ## ##	UTF-16, big-endian
FF FE ## ##	UTF-16, little-endian
EF BB BF	UTF-8