
org.carrot2.text.linguistic.DefaultLexicalDataFactory Maven / Gradle / Ivy

Carrot2 search results clustering framework core, document sources and clustering algorithms.


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.linguistic;

import static org.carrot2.util.resource.ResourceLookup.Location.*;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.annotations.AspectModified;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.ResourceCache;
import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.carrotsearch.hppc.ObjectHashSet;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * The default management of lexical resources. Resources are read from disk, cached and shared 
 * between all threads using this class. Additional attributes control resource reloading
 * and merging: {@link #resourceLookup}, {@link #reloadResources}, 
 * {@link #mergeResources}.
 */
@Bindable(inherit = LexicalDataLoader.class)
public class DefaultLexicalDataFactory implements ILexicalDataFactory
{
    /** Class logger. */
    final static Logger logger = LoggerFactory.getLogger(DefaultLexicalDataFactory.class);

    private final static Function<ResourceLookup, HashMap<LanguageCode, ILexicalData>> resourceLoader =
        new Function<ResourceLookup, HashMap<LanguageCode, ILexicalData>>()
    {
        public java.util.HashMap<LanguageCode, ILexicalData> apply(ResourceLookup resourceLookup) {
            return reloadResources(resourceLookup);
        }

        public boolean equals(Object other) {
            throw new UnsupportedOperationException();
        }

        public int hashCode()
        {
            throw new UnsupportedOperationException();
        }
    };
    
    /**
     * Static shared cache of lexical resources, keyed by a {@link ResourceLookup} 
     * used to search for resources. 
     */
    private final static ResourceCache<HashMap<LanguageCode, ILexicalData>> cache 
        = new ResourceCache<HashMap<LanguageCode, ILexicalData>>(resourceLoader);

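    /**
     * Forces a reload of lexical resources on the next processing request. The flag is
     * reset to false once the resources have been reloaded (see {@link #getLexicalData}).
     */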
    @Processing
    @Input
    @Attribute(key = "reload-resources", inherit = true)
    public boolean reloadResources = false;

    /**
     * Merges stop words and stop labels from all known languages. If set to
     * false, only stop words and stop labels of the active language will be
     * used. If set to true, stop words from all {@link org.carrot2.core.LanguageCode}s will
     * be used together and stop labels from all languages will be used together,
     * regardless of the active language. Lexical resource merging is useful when
     * clustering documents in a mix of different languages and should increase
     * clustering quality in such settings.
     */
    @Init
    @Processing
    @Input
    @Attribute(key = "merge-resources")
    @Label("Merge lexical resources")
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.PREPROCESSING)
    public boolean mergeResources = true;

    @Init
    @Processing
    @Input 
    @Internal
    @Attribute(key = "resource-lookup", inherit = true)
    @ImplementingClasses(classes = {}, strict = false)
    @AspectModified("Substituted with an assembly lookup in .NET release")
    public ResourceLookup resourceLookup = new ResourceLookup(CONTEXT_CLASS_LOADER);

    /**
     * The main logic for acquiring a shared {@link ILexicalData} instance.
     */
    @Override
    public ILexicalData getLexicalData(LanguageCode languageCode)
    {
        // If resource merging is in place, change the language code to null
        // (dedicated cache key).
        if (mergeResources)
        {
            languageCode = null;
        }

        // Fetch the shared lexical data from the cache, reloading it if requested.
        ILexicalData lexicalData = cache.get(resourceLookup, reloadResources).get(languageCode);

        // Reset reload resources trigger.
        reloadResources = false;
        
        return lexicalData;
    }

    /**
     * Reload all lexical resources associated with the given key.
     */
    private static HashMap<LanguageCode, ILexicalData> reloadResources(ResourceLookup resourceLookup)
    {
        // Load lexical resources.
        ObjectHashSet<MutableCharArray> mergedStopwords = new ObjectHashSet<>();
        ArrayList<Pattern> mergedStoplabels = Lists.newArrayList();

        HashMap<LanguageCode, ILexicalData> resourceMap = Maps.newHashMap();
        for (LanguageCode languageCode : LanguageCode.values())
        {
            final String isoCode = languageCode.getIsoCode();

            ObjectHashSet<MutableCharArray> stopwords = toLower(load(resourceLookup, "stopwords." + isoCode));
            ArrayList<Pattern> stoplabels = 
                compile(load(resourceLookup, "stoplabels." + isoCode));

            mergedStopwords.addAll(stopwords);
            mergedStoplabels.addAll(stoplabels);
            
            resourceMap.put(languageCode, new DefaultLexicalData(stopwords, stoplabels));
        }
        resourceMap.put(null, new DefaultLexicalData(mergedStopwords, mergedStoplabels));

        return resourceMap;
    }

    /**
     * All entries to lowercase.
     */
    private static ObjectHashSet<MutableCharArray> toLower(Set<String> input)
    {
        ObjectHashSet<MutableCharArray> cloned = 
            new ObjectHashSet<MutableCharArray>(input.size());

        for (String entry : input)
        {
            char [] chars = entry.toCharArray();
            CharArrayUtils.toLowerCaseInPlace(chars);
            cloned.add(new MutableCharArray(chars));
        }

        return cloned;
    }

    /**
     * Compile patterns. 
     */
    private static ArrayList<Pattern> compile(HashSet<String> patterns)
    {
        ArrayList<Pattern> compiled = new ArrayList<Pattern>(patterns.size());
        for (String pattern : patterns)
        {
            try
            {
                compiled.add(Pattern.compile(pattern));
            }
            catch (PatternSyntaxException e)
            {
                logger.warn("Ignoring invalid regular expression: " + pattern);
            }
        }

        return compiled;
    }

    /**
     * Attempts to load resourceName from the provided {@link ResourceLookup}.
     */
    private static HashSet<String> load(ResourceLookup resourceLookup, String resourceName)
    {
        final IResource resource = resourceLookup.getFirst(resourceName);
        if (resource == null)
        {
            throw new RuntimeException(
                "No resource named " + resourceName + 
                " in resource lookup locations: " + 
                Arrays.toString(resourceLookup.getLocators()));
        }
        else
        {
            try
            {
                return load(resource);
            }
            catch (IOException e)
            {
                throw new RuntimeException(
                    "Resource named " + resourceName + 
                    " failed to load from: " + resource.toString(), e);
            }
        }
    }

    /**
     * Loads words from a given {@link IResource} (UTF-8, one word per line, #-starting lines 
     * are considered comments).
     */
    public static HashSet<String> load(IResource resource) throws IOException
    {
        final HashSet<String> words = Sets.newHashSet();

        final InputStream is = resource.open();
        if (is == null)
            throw new IOException("Resource returned null stream: " + resource);

        final BufferedReader reader = new BufferedReader(new InputStreamReader(is,
            "UTF-8"));
        
        try
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                line = line.trim();
                if (line.startsWith("#") || line.length() == 0)
                {
                    continue;
                }
    
                words.add(line);
            }
        }
        finally
        {
            reader.close();
        }

        return words;
    }
}
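
A minimal usage sketch of the factory above. Direct instantiation and field assignment are shown only for illustration; in a real Carrot2 pipeline the factory and its attributes are configured through the attribute binding mechanism, and the isCommonWord() lookup is assumed to be part of the ILexicalData interface in this Carrot2 version.

import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.util.MutableCharArray;

public class LexicalDataExample
{
    public static void main(String [] args)
    {
        // Keep per-language resources separate instead of using the merged (null-keyed) set.
        DefaultLexicalDataFactory factory = new DefaultLexicalDataFactory();
        factory.mergeResources = false;

        // Loads stopwords.en / stoplabels.en (and the other bundled languages) via the
        // context class loader lookup, caches them, and returns the English view.
        ILexicalData english = factory.getLexicalData(LanguageCode.ENGLISH);

        // Stop-word lookup operates on MutableCharArray instances.
        System.out.println(english.isCommonWord(new MutableCharArray("the")));
    }
}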



