org.apache.tika.language.LanguageProfile Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-core Show documentation
Show all versions of tika-core Show documentation
This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also
includes the core facades for the Tika API.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.language;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
/**
* Language profile based on ngram counts.
*
* @since Apache Tika 0.5
* @deprecated
*/
@Deprecated
public class LanguageProfile {
public static final int DEFAULT_NGRAM_LENGTH = 3;
private final int length;
/**
* The ngrams that make up this profile.
*/
private final Map ngrams =
new HashMap();
/**
* Sorted ngram cache for faster distance calculation.
*/
private Interleaved interleaved = new Interleaved();
public static boolean useInterleaved = true; // For testing purposes
/**
* The sum of all ngram counts in this profile.
* Used to calculate relative ngram frequency.
*/
private long count = 0;
private static class Counter {
private long count = 0;
public String toString() {
return Long.toString(count);
}
}
public LanguageProfile(int length) {
this.length = length;
}
public LanguageProfile() {
this(DEFAULT_NGRAM_LENGTH);
}
public LanguageProfile(String content, int length) {
this(length);
ProfilingWriter writer = new ProfilingWriter(this);
char[] ch = content.toCharArray();
writer.write(ch, 0, ch.length);
}
public LanguageProfile(String content) {
this(content, DEFAULT_NGRAM_LENGTH);
}
public long getCount() {
return count;
}
public long getCount(String ngram) {
Counter counter = ngrams.get(ngram);
if (counter != null) {
return counter.count;
} else {
return 0;
}
}
/**
* Adds a single occurrence of the given ngram to this profile.
*
* @param ngram the ngram
*/
public void add(String ngram) {
add(ngram, 1);
}
/**
* Adds multiple occurrences of the given ngram to this profile.
*
* @param ngram the ngram
* @param count number of occurrences to add
*/
public void add(String ngram, long count) {
if (length != ngram.length()) {
throw new IllegalArgumentException(
"Unable to add an ngram of incorrect length: "
+ ngram.length() + " != " + length);
}
Counter counter = ngrams.get(ngram);
if (counter == null) {
counter = new Counter();
ngrams.put(ngram, counter);
}
counter.count += count;
this.count += count;
}
/**
* Calculates the geometric distance between this and the given
* other language profile.
*
* @param that the other language profile
* @return distance between the profiles
*/
public double distance(LanguageProfile that) {
return useInterleaved ? distanceInterleaved(that) : distanceStandard(that);
}
private double distanceStandard(LanguageProfile that) {
if (length != that.length) {
throw new IllegalArgumentException(
"Unable to calculage distance of language profiles"
+ " with different ngram lengths: "
+ that.length + " != " + length);
}
double sumOfSquares = 0.0;
double thisCount = Math.max(this.count, 1.0);
double thatCount = Math.max(that.count, 1.0);
Set ngrams = new HashSet();
ngrams.addAll(this.ngrams.keySet());
ngrams.addAll(that.ngrams.keySet());
for (String ngram : ngrams) {
double thisFrequency = this.getCount(ngram) / thisCount;
double thatFrequency = that.getCount(ngram) / thatCount;
double difference = thisFrequency - thatFrequency;
sumOfSquares += difference * difference;
}
return Math.sqrt(sumOfSquares);
}
@Override
public String toString() {
return ngrams.toString();
}
/* Code for interleaved distance calculation below */
private double distanceInterleaved(LanguageProfile that) {
if (length != that.length) {
throw new IllegalArgumentException(
"Unable to calculage distance of language profiles"
+ " with different ngram lengths: "
+ that.length + " != " + length);
}
double sumOfSquares = 0.0;
double thisCount = Math.max(this.count, 1.0);
double thatCount = Math.max(that.count, 1.0);
Interleaved.Entry thisEntry = updateInterleaved().firstEntry();
Interleaved.Entry thatEntry = that.updateInterleaved().firstEntry();
// Iterate the lists in parallel, until both lists has been depleted
while (thisEntry.hasNgram() || thatEntry.hasNgram()) {
if (!thisEntry.hasNgram()) { // Depleted this
sumOfSquares += square(thatEntry.count / thatCount);
thatEntry.next();
continue;
}
if (!thatEntry.hasNgram()) { // Depleted that
sumOfSquares += square(thisEntry.count / thisCount);
thisEntry.next();
continue;
}
final int compare = thisEntry.compareTo(thatEntry);
if (compare == 0) { // Term exists both in this and that
double difference = thisEntry.count/thisCount - thatEntry.count/thatCount;
sumOfSquares += square(difference);
thisEntry.next();
thatEntry.next();
} else if (compare < 0) { // Term exists only in this
sumOfSquares += square(thisEntry.count/thisCount);
thisEntry.next();
} else { // Term exists only in that
sumOfSquares += square(thatEntry.count/thatCount);
thatEntry.next();
}
}
return Math.sqrt(sumOfSquares);
}
private double square(double count) {
return count * count;
}
private class Interleaved {
private char[] entries = null; // *
private int size = 0; // Number of entries (one entry = length+2 chars)
private long entriesGeneratedAtCount = -1; // Keeps track of when the sequential structure was current
/**
* Ensure that the entries array is in sync with the ngrams.
*/
public void update() {
if (count == entriesGeneratedAtCount) { // Already up to date
return;
}
size = ngrams.size();
final int numChars = (length+2)*size;
if (entries == null || entries.length < numChars) {
entries = new char[numChars];
}
int pos = 0;
for (Map.Entry entry: getSortedNgrams()) {
for (int l = 0 ; l < length ; l++) {
entries[pos + l] = entry.getKey().charAt(l);
}
entries[pos + length] = (char)(entry.getValue().count / 65536); // Upper 16 bit
entries[pos + length + 1] = (char)(entry.getValue().count % 65536); // lower 16 bit
pos += length + 2;
}
entriesGeneratedAtCount = count;
}
public Entry firstEntry() {
Entry entry = new Entry();
if (size > 0) {
entry.update(0);
}
return entry;
}
private List> getSortedNgrams() {
List> entries = new ArrayList>(ngrams.size());
entries.addAll(ngrams.entrySet());
Collections.sort(entries, new Comparator>() {
@Override
public int compare(Map.Entry o1, Map.Entry o2) {
return o1.getKey().compareTo(o2.getKey());
}
});
return entries;
}
private class Entry implements Comparable {
char[] ngram = new char[length];
int count = 0;
int pos = 0;
private void update(int pos) {
this.pos = pos;
if (pos >= size) { // Reached the end
return;
}
final int origo = pos*(length+2);
System.arraycopy(entries, origo, ngram, 0, length);
count = entries[origo+length] * 65536 + entries[origo+length+1];
}
@Override
public int compareTo(Entry other) {
for (int i = 0 ; i < ngram.length ; i++) {
if (ngram[i] != other.ngram[i]) {
return ngram[i] - other.ngram[i];
}
}
return 0;
}
public boolean hasNext() {
return pos < size-1;
}
public boolean hasNgram() {
return pos < size;
}
public void next() {
update(pos+1);
}
public String toString() {
return new String(ngram) + "(" + count + ")";
}
}
}
private Interleaved updateInterleaved() {
interleaved.update();
return interleaved;
}
}