All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.language.LanguageProfile Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.language;


import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

/**
 * Language profile based on ngram counts.
 *
 * @since Apache Tika 0.5
 * @deprecated
 */
@Deprecated
public class LanguageProfile {

    public static final int DEFAULT_NGRAM_LENGTH = 3;

    private final int length;

    /**
     * The ngrams that make up this profile.
     */
    private final Map ngrams =
        new HashMap();

    /**
     * Sorted ngram cache for faster distance calculation.
     */
    private Interleaved interleaved = new Interleaved();
    public static boolean useInterleaved = true; // For testing purposes

    /**
     * The sum of all ngram counts in this profile.
     * Used to calculate relative ngram frequency.
     */
    private long count = 0;

    private static class Counter {
        private long count = 0;
        public String toString() {
            return Long.toString(count);
        }
    }

    public LanguageProfile(int length) {
        this.length = length;
    }

    public LanguageProfile() {
        this(DEFAULT_NGRAM_LENGTH);
    }

    public LanguageProfile(String content, int length) {
        this(length);

        ProfilingWriter writer = new ProfilingWriter(this);
        char[] ch = content.toCharArray();
        writer.write(ch, 0, ch.length);
    }

    public LanguageProfile(String content) {
        this(content, DEFAULT_NGRAM_LENGTH);
    }

    public long getCount() {
        return count;
    }

    public long getCount(String ngram) {
        Counter counter = ngrams.get(ngram);
        if (counter != null) {
            return counter.count;
        } else {
            return 0;
        }
    }

    /**
     * Adds a single occurrence of the given ngram to this profile.
     *
     * @param ngram the ngram
     */
    public void add(String ngram) {
        add(ngram, 1);
    }

    /**
     * Adds multiple occurrences of the given ngram to this profile.
     *
     * @param ngram the ngram
     * @param count number of occurrences to add
     */
    public void add(String ngram, long count) {
        if (length != ngram.length()) {
            throw new IllegalArgumentException(
                    "Unable to add an ngram of incorrect length: "
                    + ngram.length() + " != " + length);
        }

        Counter counter = ngrams.get(ngram);
        if (counter == null) {
            counter = new Counter();
            ngrams.put(ngram, counter);
        }
        counter.count += count;
        this.count += count;
    }

    /**
     * Calculates the geometric distance between this and the given
     * other language profile.
     *
     * @param that the other language profile
     * @return distance between the profiles
     */
    public double distance(LanguageProfile that) {
        return useInterleaved ? distanceInterleaved(that) : distanceStandard(that);
    }

    private double distanceStandard(LanguageProfile that) {
        if (length != that.length) {
            throw new IllegalArgumentException(
                    "Unable to calculage distance of language profiles"
                    + " with different ngram lengths: "
                    + that.length + " != " + length);
        }

        double sumOfSquares = 0.0;
        double thisCount = Math.max(this.count, 1.0);
        double thatCount = Math.max(that.count, 1.0);

        Set ngrams = new HashSet();
        ngrams.addAll(this.ngrams.keySet());
        ngrams.addAll(that.ngrams.keySet());
        for (String ngram : ngrams) {
            double thisFrequency = this.getCount(ngram) / thisCount;
            double thatFrequency = that.getCount(ngram) / thatCount;
            double difference = thisFrequency - thatFrequency;
            sumOfSquares += difference * difference;
        }

        return Math.sqrt(sumOfSquares);
    }

    @Override
    public String toString() {
        return ngrams.toString();
    }

    /* Code for interleaved distance calculation below */

    private double distanceInterleaved(LanguageProfile that) {
        if (length != that.length) {
            throw new IllegalArgumentException(
                    "Unable to calculage distance of language profiles"
                    + " with different ngram lengths: "
                    + that.length + " != " + length);
        }
       
        double sumOfSquares = 0.0;
        double thisCount = Math.max(this.count, 1.0);
        double thatCount = Math.max(that.count, 1.0);
        
        Interleaved.Entry thisEntry = updateInterleaved().firstEntry();
        Interleaved.Entry thatEntry = that.updateInterleaved().firstEntry();

        // Iterate the lists in parallel, until both lists has been depleted
        while (thisEntry.hasNgram() || thatEntry.hasNgram()) {
            if (!thisEntry.hasNgram()) { // Depleted this
                sumOfSquares += square(thatEntry.count / thatCount);
                thatEntry.next();
                continue;
            }

            if (!thatEntry.hasNgram()) { // Depleted that
                sumOfSquares += square(thisEntry.count / thisCount);
                thisEntry.next();
                continue;
            }

            final int compare = thisEntry.compareTo(thatEntry);

            if (compare == 0) { // Term exists both in this and that
                double difference = thisEntry.count/thisCount - thatEntry.count/thatCount;
                sumOfSquares += square(difference);
                thisEntry.next();
                thatEntry.next();
            } else if (compare < 0) { // Term exists only in this
                sumOfSquares += square(thisEntry.count/thisCount);
                thisEntry.next();
            } else { // Term exists only in that
                sumOfSquares += square(thatEntry.count/thatCount);
                thatEntry.next();
            }
        }
        return Math.sqrt(sumOfSquares);
    }
    private double square(double count) {
        return count * count;
    }

    private class Interleaved {

        private char[] entries = null; // *
        private int size = 0; // Number of entries (one entry = length+2 chars)
        private long entriesGeneratedAtCount = -1; // Keeps track of when the sequential structure was current

        /**
         * Ensure that the entries array is in sync with the ngrams.
         */
        public void update() {
            if (count == entriesGeneratedAtCount) { // Already up to date
                return;
            }
            size = ngrams.size();
            final int numChars = (length+2)*size;
            if (entries == null || entries.length < numChars) {
                entries = new char[numChars];
            }
            int pos = 0;
            for (Map.Entry entry: getSortedNgrams()) {
                for (int l = 0 ; l < length ; l++) {
                    entries[pos + l] = entry.getKey().charAt(l);
                }
                entries[pos + length] = (char)(entry.getValue().count / 65536); // Upper 16 bit
                entries[pos + length + 1] = (char)(entry.getValue().count % 65536); // lower 16 bit
                pos += length + 2;
            }
            entriesGeneratedAtCount = count;
        }

        public Entry firstEntry() {
            Entry entry = new Entry();
            if (size > 0) {
                entry.update(0);
            }
            return entry;
        }
        
        private List> getSortedNgrams() {
            List> entries = new ArrayList>(ngrams.size());
            entries.addAll(ngrams.entrySet());
            Collections.sort(entries, new Comparator>() {
                @Override
                public int compare(Map.Entry o1, Map.Entry o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            });
            return entries;
        }
        
        private class Entry implements Comparable {
            char[] ngram = new char[length];
            int count = 0;
            int pos = 0;

            private void update(int pos) {
                this.pos = pos;
                if (pos >= size) { // Reached the end
                    return;
                }
                final int origo = pos*(length+2);
                System.arraycopy(entries, origo, ngram, 0, length);
                count = entries[origo+length] * 65536 + entries[origo+length+1];
            }

            @Override
            public int compareTo(Entry other) {
                for (int i = 0 ; i < ngram.length ; i++) {
                    if (ngram[i] != other.ngram[i]) {
                        return ngram[i] - other.ngram[i];
                    }
                }
                return 0;
            }
            public boolean hasNext() {
                return pos < size-1;
            }
            public boolean hasNgram() {
                return pos < size;
            }
            public void next() {
                update(pos+1);
            }
            public String toString() {
                return new String(ngram) + "(" + count + ")";
            }
        }
    }
    private Interleaved updateInterleaved() {
        interleaved.update();
        return interleaved;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy