All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.detect.TextStatistics Maven / Gradle / Ivy

Go to download

This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API.

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

/**
 * Utility class for computing a histogram of the bytes seen in a stream.
 *
 * @since Apache Tika 1.2
 */
public class TextStatistics {

    private final int[] counts = new int[256];

    private int total = 0;

    public void addData(byte[] buffer, int offset, int length) {
        for (int i = 0; i < length; i++) {
            counts[buffer[offset + i] & 0xff]++;
            total++;
        }
    }

    /**
     * Checks whether at least one byte was seen and that the bytes that
     * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range).
     *
     * @see TIKA-483
     * @see TIKA-688
     * @return true if the seen bytes were mostly safe ASCII,
     *         false otherwise
     */
    public boolean isMostlyAscii() {
        int control = count(0, 0x20);
        int ascii = count(0x20, 128);
        int safe = countSafeControl();
        return total > 0
                && (control - safe) * 100 < total * 2
                && (ascii + safe) * 100 > total * 90;
    }

    /**
     * Checks whether the observed byte stream looks like UTF-8 encoded text.
     *
     * @since Apache Tika 1.3
     * @return true if the seen bytes look like UTF-8,
     *         false otherwise
     */
    public boolean looksLikeUTF8() {
        int control = count(0, 0x20);
        int utf8 = count(0x20, 0x80);
        int safe = countSafeControl();

        int expectedContinuation = 0;
        int[] leading = new int[] {
                count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8) };
        for (int i = 0; i < leading.length; i++) {
            utf8 += leading[i];
            expectedContinuation += (i + 1) * leading[i];
        }

        int continuation = count(0x80, 0xc0);
        return utf8 > 0
                && continuation <= expectedContinuation
                && continuation >= expectedContinuation - 3
                && count(0xf80, 0x100) == 0
                && (control - safe) * 100 < utf8 * 2;
    }

    /**
     * Returns the total number of bytes seen so far.
     *
     * @return count of all bytes
     */
    public int count() {
        return total;
    }

    /**
     * Returns the number of occurrences of the given byte.
     *
     * @param b byte
     * @return count of the given byte
     */
    public int count(int b) {
        return counts[b & 0xff];
    }

    /**
     * Counts control characters (i.e. < 0x20, excluding tab, CR, LF,
     * page feed and escape).
     * 

* This definition of control characters is based on section 4 of the * "Content-Type Processing Model" Internet-draft * (draft-abarth-mime-sniff-01). *

     * +-------------------------+
     * | Binary data byte ranges |
     * +-------------------------+
     * | 0x00 -- 0x08            |
     * | 0x0B                    |
     * | 0x0E -- 0x1A            |
     * | 0x1C -- 0x1F            |
     * +-------------------------+
     * 
* * @see TIKA-154 * @return count of control characters */ public int countControl() { return count(0, 0x20) - countSafeControl(); } /** * Counts "safe" (i.e. seven-bit non-control) ASCII characters. * * @see #countControl() * @return count of safe ASCII characters */ public int countSafeAscii() { return count(0x20, 128) + countSafeControl(); } /** * Counts eight bit characters, i.e. bytes with their highest bit set. * * @return count of eight bit characters */ public int countEightBit() { return count(128, 256); } private int count(int from, int to) { assert 0 <= from && to <= counts.length; int count = 0; for (int i = from; i < to; i++) { count += counts[i]; } return count; } private int countSafeControl() { return count('\t') + count('\n') + count('\r') // tab, LF, CR + count(0x0c) + count(0x1b); // new page, escape } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy