All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.bl.wa.analyser.text.lang.LanguageIdentifier Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package uk.bl.wa.analyser.text.lang;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * .
 * #L%
 */

import uk.bl.wa.util.Instrument;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

/**
 * Identifier of the language that best matches a given content profile.
 * The content profile is compared to generic language profiles based on
 * material from various sources.
 *
 * @since Apache Tika 0.5
 * @see 
 *      Europarl: A Parallel Corpus for Statistical Machine Translation
 * @see 
 *      ISO 639 Language Codes
 */
public class LanguageIdentifier {
    
    /**
     * The available language profiles.
     */
    private static final Map PROFILES =
        new HashMap();
    private static final String PROFILE_SUFFIX = ".ngp";
    private static final String PROFILE_ENCODING = "UTF-8";

    private static Properties props = new Properties();
    private static String errors = "";

    private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties";
    private static final String PROPERTIES_FILE = "tika.language.properties";
    private static final String LANGUAGES_KEY = "languages";
    private static final double CERTAINTY_LIMIT = 0.022;

    private final String language;

    private final double distance;

    /*
     * Always attempt initializing language profiles when class is loaded first time
     */
    static {
        initProfiles();
    }
    
    /*
     * Add one language profile based on config in property file
     */
    private static void addProfile(String language) throws Exception {
        final long start = System.nanoTime();
        try {
            LanguageProfile profile = new LanguageProfile();

            InputStream stream = getResourceAsStream(language + PROFILE_SUFFIX);
            try {
                BufferedReader reader =
                    new BufferedReader(new InputStreamReader(stream, PROFILE_ENCODING));
                String line = reader.readLine();
                while (line != null) {
                    if (line.length() > 0 && !line.startsWith("#")) {
                        int space = line.indexOf(' ');
                        profile.add(
                                line.substring(0, space),
                                Long.parseLong(line.substring(space + 1)));
                    }
                    line = reader.readLine();
                }
            } finally {
                stream.close();
            }

            addProfile(language, profile);
        } catch (Throwable t) {
            throw new Exception("Failed trying to load language profile for language \""+language+"\". Error: "+t.getMessage());
        } finally {
            Instrument.timeRel("LanguageDetector.detectLanguage#li", "LanguageIdentifier.addProfile", start);
        }
    }

    private static InputStream getResourceAsStream(String path) {
        InputStream is;
        return (is = Thread.currentThread().getContextClassLoader().getResourceAsStream(path)) != null ? is :
                org.apache.tika.language.LanguageIdentifier.class.getResourceAsStream(path);
        //return Thread.currentThread().getContextClassLoader().getResourceAsStream(path);
    }

    /**
     * Adds a single language profile
     * @param language an ISO 639 code representing language
     * @param profile the language profile
     */
    public static void addProfile(String language, LanguageProfile profile) {
        PROFILES.put(language, profile);
    }
    
    /**
     * Constructs a language identifier based on a LanguageProfile
     * @param profile the language profile
     */
    public LanguageIdentifier(LanguageProfile profile) {
        final long start = System.nanoTime();
        String minLanguage = "unknown";
        double minDistance = 1.0;
        for (Map.Entry entry : PROFILES.entrySet()) {
            double distance = profile.distance(entry.getValue());
            if (distance < minDistance) {
                minDistance = distance;
                minLanguage = entry.getKey();
            }
        }

        this.language = minLanguage;
        this.distance = minDistance;
        Instrument.timeRel("LanguageDetector.detectLanguage#li", "LanguageIdentifier#matchlanguageprofile", start);
    }

    /**
     * Constructs a language identifier based on a String of text content
     * @param content the text
     */
    public LanguageIdentifier(String content) {
        this(new LanguageProfile(content));
    }

    /**
     * Gets the identified language
     * @return an ISO 639 code representing the detected language
     */
    public String getLanguage() {
        return language;
    }

    /**
     * Tries to judge whether the identification is certain enough
     * to be trusted.
     * WARNING: Will never return true for small amount of input texts. 
     * @return true if the distance is smaller then {@value #CERTAINTY_LIMIT}, false otherwise
     */
    public boolean isReasonablyCertain() {
        return distance < CERTAINTY_LIMIT;
    }

    /**
     * Builds the language profiles.
     * The list of languages are fetched from a property file named "tika.language.properties"
     * If a file called "tika.language.override.properties" is found on classpath, this is used instead
     * The property file contains a key "languages" with values being comma-separated language codes
     */
    public static void initProfiles() {
        clearProfiles();
        
        errors = "";
        InputStream stream;
        stream = getResourceAsStream(PROPERTIES_OVERRIDE_FILE);
        //stream = LanguageIdentifier.class.getResourceAsStream(PROPERTIES_OVERRIDE_FILE);
        if(stream == null) {
            stream = getResourceAsStream(PROPERTIES_FILE);
        }

        if (stream == null) {
            throw new RuntimeException(
                    "Unable to locate a properties stream for either " + PROPERTIES_OVERRIDE_FILE + " or "
                    + PROPERTIES_FILE);
        }

        if(stream != null){
            try {
                props = new Properties();
                props.load(stream);
            } catch (IOException e) {
                errors += "IOException while trying to load property file. Message: " + e.getMessage() + "\n";
            }
        }
        
        String[] languages = props.getProperty(LANGUAGES_KEY).split(",");
        for(String language : languages) {
            language = language.trim();
            String name = props.getProperty("name."+language, "Unknown");
            try {
                addProfile(language);
            } catch (Exception e) {
                errors += "Language " + language + " (" + name + ") not initialized. Message: " + e.getMessage() + "\n";
            }
        }
    }

    /**
     * Initializes the language profiles from a user supplied initialized Map.
     * This overrides the default set of profiles initialized at startup,
     * and provides an alternative to configuring profiles through property file
     *
     * @param profilesMap map of language profiles
     */
    public static void initProfiles(Map profilesMap) {
        clearProfiles();
        for(Map.Entry entry : profilesMap.entrySet()) {
            addProfile(entry.getKey(), entry.getValue());
        }
    }
    
    /**
     * Clears the current map of language profiles
     */
    public static void clearProfiles() {
        PROFILES.clear();
    }
    
    /**
     * Tests whether there were errors initializing language config
     * @return true if there are errors. Use getErrors() to retrieve.
     */
    public static boolean hasErrors() {
        return errors != "";
    }
    
    /**
     * Returns a string of error messages related to initializing langauge profiles
     * @return the String containing the error messages
     */
    public static String getErrors() {
        return errors;
    }
    
    /**
     * Returns what languages are supported for language identification
     * @return A set of Strings being the ISO 639 language codes
     */
    public static Set getSupportedLanguages() {
        return PROFILES.keySet();
    }

    @Override
    public String toString() {
        return language + " (" + distance + ")";
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy