org.apache.tika.parser.txt.CharsetRecog_UTF8 Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of tika-parsers Show documentation
There is a newer version: 3.0.0-BETA2
/**
*******************************************************************************
* Copyright (C) 2005 - 2007, International Business Machines Corporation and  *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
package org.apache.tika.parser.txt;

/**
 * Charset recognizer for UTF-8
 *
 * @internal
 */
class CharsetRecog_UTF8 extends CharsetRecognizer {

    String getName() {
        return "UTF-8";
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
     */
    int match(CharsetDetector det) {
        boolean     hasBOM = false;
        int         numValid = 0;
        int         numInvalid = 0;
        byte        input[] = det.fRawInput;
        int         i;
        int         trailBytes = 0;
        int         confidence;
        
        if (det.fRawLength >= 3 && 
                (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) {
            hasBOM = true;
        }
        
        // Scan for multi-byte sequences
        for (i=0; i 5) {
                    break;
                }
                trailBytes = 0;
            }
                
            // Verify that we've got the right number of trail bytes in the sequence
            for (;;) {
                i++;
                if (i>=det.fRawLength) {
                    break;
                }
                b = input[i];
                if ((b & 0xc0) != 0x080) {
                    numInvalid++;
                    break;
                }
                if (--trailBytes == 0) {
                    numValid++;
                    break;
                }
            }
                        
        }
        
        // Cook up some sort of confidence score, based on presense of a BOM
        //    and the existence of valid and/or invalid multi-byte sequences.
        confidence = 0;
        if (hasBOM && numInvalid==0) {
            confidence = 100;
        } else if (hasBOM && numValid > numInvalid*10) {
            confidence = 80;
        } else if (numValid > 3 && numInvalid == 0) {
            confidence = 100;            
        } else if (numValid > 0 && numInvalid == 0) {
            confidence = 80;
        } else if (numValid == 0 && numInvalid == 0) {
            // Plain ASCII.  
            confidence = 10;            
        } else if (numValid > numInvalid*10) {
            // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
            confidence = 25;
        }
        return confidence;
    }

}