All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.txt.CharsetRecog_2022 Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
*******************************************************************************
* Copyright (C) 2005 - 2008, International Business Machines Corporation and  *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
package org.apache.tika.parser.txt;

/**
 *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
 *                           This is a superclass for the individual detectors for
 *                           each of the detectable members of the ISO 2022 family
 *                           of encodings.
 * 
 *                           The separate classes are nested within this class.
 * 
 * @internal
 */
abstract class CharsetRecog_2022 extends CharsetRecognizer {

    
    /**
     * Matching function shared among the 2022 detectors JP, CN and KR
     * Counts up the number of legal an unrecognized escape sequences in
     * the sample of text, and computes a score based on the total number &
     * the proportion that fit the encoding.
     * 
     * 
     * @param text the byte buffer containing text to analyse
     * @param textLen  the size of the text in the byte.
     * @param escapeSequences the byte escape sequences to test for.
     * @return match quality, in the range of 0-100.
     */
    int   match(byte [] text, int textLen, byte [][] escapeSequences) {
        int     i, j;
        int     escN;
        int     hits   = 0;
        int     misses = 0;
        int     shifts = 0;
        int     quality;
        scanInput:
            for (i=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy