// Source: com.dell.doradus.fieldanalyzer.FieldAnalyzer

/*
 * Copyright (C) 2014 Dell, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dell.doradus.fieldanalyzer;

import java.lang.reflect.Method;
import java.util.Collection;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import java.util.Set;

import com.dell.doradus.common.CommonDefs;
import com.dell.doradus.common.FieldDefinition;
import com.dell.doradus.common.FieldType;
import com.dell.doradus.common.TableDefinition;
import com.dell.doradus.common.Utils;

/**
 * Abstract base class for field analyzers. An analyzer turns a scalar field value into
 * the terms that should be indexed and forms the row keys used to store those terms.
 * Concrete analyzers are looked up by name through the static {@code findAnalyzer}
 * methods, which expect each analyzer class to expose a public static, no-argument
 * {@code instance()} method returning its singleton. This is tentative and likely to
 * change.
 */
public abstract class FieldAnalyzer {

    // Default package in which we expect analyzer classes:
    private static final String DEFAULT_ANALYZER_PACKAGE = "com.dell.doradus.fieldanalyzer.";

    // Name of the "instance" method we expect each analyzer to have:
    private static final String INSTANCE_METHOD_NAME = "instance";

    // Cache of analyzer objects, keyed by fully-qualified class name, since
    // findAnalyzer() is expensive. We use a Hashtable since it is synchronized:
    private static final Map<String, FieldAnalyzer> g_analyzerCache =
        new Hashtable<String, FieldAnalyzer>();

    // Collection of compatible FieldTypes for this analyzer:
    private final Collection<FieldType> m_compatibleTypes;

    /**
     * Construct the analyzer and cache its compatible field types. Note that this calls
     * the overridable {@link #getCompatibleFieldTypes()} before the subclass constructor
     * body has run, so implementations of that method must not depend on subclass
     * instance state.
     */
    protected FieldAnalyzer() {
        m_compatibleTypes = getCompatibleFieldTypes();
    }

    /**
     * This method is called once by the constructor to cache the set of field types
     * for which this analyzer can be used.
     *
     * @return Collection of the {@link FieldType}s with which this analyzer can be used.
     */
    abstract protected Collection<FieldType> getCompatibleFieldTypes();

    /**
     * Return the singleton instance of the {@link FieldAnalyzer} class for the given
     * scalar field. If the field's analyzer name does not contain a package path, the
     * default {@value #DEFAULT_ANALYZER_PACKAGE} is prepended to it. If no analyzer with
     * the given name can be found, an IllegalArgumentException is thrown.
     *
     * @param fieldDef  {@link FieldDefinition} of a scalar field. It must have an
     *                  analyzer name assigned.
     * @return          Singleton instance object defined for the scalar field.
     */
    public static FieldAnalyzer findAnalyzer(FieldDefinition fieldDef) {
        Utils.require(fieldDef.isScalarField(), "Must be a scalar field: %s", fieldDef);

        String analyzerName = fieldDef.getAnalyzerName();
        Utils.require(analyzerName != null, "Scalar field has no analyzer: %s", fieldDef);
        return findAnalyzer(analyzerName);
    }   // findAnalyzer

    /**
     * Return the singleton instance of the {@link FieldAnalyzer} for the given scalar
     * field belonging to the given table. If the scalar is not defined in the table, the
     * field is assumed to be text and the {@link TextAnalyzer} instance is returned.
     * Otherwise, the field definition's analyzer name is passed to {@link #findAnalyzer(String)}
     * whose result is returned. This method works with scalar fields that are not defined
     * in the table.
     *
     * @param tableDef  Table that owns the scalar field.
     * @param fieldName Name of the scalar field.
     * @return          Singleton instance object of the analyzer that should be used to
     *                  index the given scalar field.
     */
    public static FieldAnalyzer findAnalyzer(TableDefinition tableDef, String fieldName) {
        assert tableDef != null;
        assert fieldName != null;

        // Undefined fields fall back to the text analyzer.
        FieldAnalyzer analyzer = TextAnalyzer.instance();
        FieldDefinition fieldDef = tableDef.getFieldDef(fieldName);
        if (fieldDef != null) {
            // Pass the field as a format argument (as findAnalyzer(FieldDefinition) does)
            // so its string form is not built eagerly on every successful call.
            Utils.require(fieldDef.isScalarField(), "Must be a scalar field: %s", fieldDef);
            analyzer = findAnalyzer(fieldDef.getAnalyzerName());
        }
        return analyzer;
    }   // findAnalyzer

    /**
     * Return the singleton instance of the {@link FieldAnalyzer} class for the given
     * analyzer name. If the given analyzer name does not contain a package path, the
     * default {@value #DEFAULT_ANALYZER_PACKAGE} is prepended to it and, if the name does
     * not already end with "Analyzer", that suffix is appended. Successful lookups are
     * cached under the resulting fully-qualified name.
     *
     * @param analyzerName  Name of a field analyzer (e.g. "NullAnalyzer"). Must not be
     *                      empty.
     * @return              Singleton analyzer object.
     * @throws IllegalArgumentException If the class cannot be loaded, has no usable
     *                      static {@code instance()} method, or that method does not
     *                      yield a {@link FieldAnalyzer}. The underlying failure is
     *                      attached as the cause.
     */
    public static FieldAnalyzer findAnalyzer(String analyzerName) {
        Utils.require(!Utils.isEmpty(analyzerName), "analyzerName");

        // See if full package name was given.
        if (analyzerName.indexOf('.') < 0) {
            // Prepend default package name and, if needed, append "Analyzer".
            analyzerName = DEFAULT_ANALYZER_PACKAGE + analyzerName;
            if (!analyzerName.endsWith("Analyzer")) {
                analyzerName += "Analyzer";
            }
        }

        // Try the analyzer cache first.
        FieldAnalyzer fieldAnalyzer = g_analyzerCache.get(analyzerName);
        if (fieldAnalyzer != null) {
            return fieldAnalyzer;
        }

        // Analyzer not in the cache.
        try {
            // Attempt to load the corresponding class. This throws if the class cannot be
            // found. A wildcard type is enough here: the actual type is only verified by
            // the cast of the instance() result below.
            Class<?> analyzerClass = Class.forName(analyzerName);

            // Attempt to find the "instance" static method. This also throws if not found.
            Method instanceMethod = analyzerClass.getMethod(INSTANCE_METHOD_NAME);

            // Attempt to invoke the instance method, which should return an instance of the
            // FieldAnalyzer we want. Since we expect it to be static, we pass null for the
            // "object" parameter. We also expect it to have no parameters, so we pass none.
            fieldAnalyzer = (FieldAnalyzer)instanceMethod.invoke(null);

            // Here, found it. Add to cache and return it. Two threads may race to this
            // point for the same name; both then store an analyzer for the same key.
            g_analyzerCache.put(analyzerName, fieldAnalyzer);   // OK if duplicate
            return fieldAnalyzer;
        } catch (Exception e) {
            // Didn't find the class or method we were looking for. Keep the original
            // failure as the cause so the real reason is not lost.
            throw new IllegalArgumentException("Analyzer not found: " + analyzerName, e);
        }
    }   // findAnalyzer

    /**
     * Verify that the analyzer defined for the given field is a known analyzer and that it
     * is valid for its declared type.  This method can only be called for scalar fields,
     * and the field's type and analyzer must be set. This method calls
     * {@link #findAnalyzer(FieldDefinition)} and then looks for the field's type in the
     * set of {@link FieldType}s returned by {@link #compatibleFieldTypes()}.
     *
     * @param fieldDef  {@link FieldDefinition} of a scalar field.
     */
    public static void verifyAnalyzer(FieldDefinition fieldDef) {
        assert fieldDef != null;

        FieldAnalyzer analyzer = findAnalyzer(fieldDef);
        Utils.require(analyzer.compatibleFieldTypes().contains(fieldDef.getType()),
                      "Invalid analyzer for field type '%s': %s", fieldDef.getType(), fieldDef.getAnalyzerName());
    }   // verifyAnalyzer

    /**
     * Tokenize the given String value and return the array of tokens that should be indexed.
     *
     * @param value Field value to be indexed as a String.
     * @return      Array of terms that should be indexed.
     */
    abstract public String[] tokenize(String value);

    /**
     * Return the set of scalar {@link FieldType}s with which this analyzer can be used.
     * Each analyzer can be used for at least one field type. The returned collection is
     * the one cached at construction time; it is not copied.
     *
     * @return  Collection of field types for which this analyzer can be used.
     */
    public Collection<FieldType> compatibleFieldTypes() {
        return m_compatibleTypes;
    }

    /**
     * Analyze the given String value and return the set of terms that should be indexed.
     * The value is down-cased, split on {@link CommonDefs#MV_SCALAR_SEP_CHAR}, and each
     * piece is passed to {@link #tokenize(String)}; empty tokens are discarded.
     *
     * @param value Field value to be indexed as a String.
     * @return      Set of terms that should be indexed.
     * @throws IllegalArgumentException If any step of the analysis fails (including a
     *              null value). The underlying failure is attached as the cause.
     */
    public Set<String> extractTerms(String value) {
        try {
            Set<String> result = new HashSet<String>();
            // NOTE(review): toLowerCase() uses the JVM default locale; confirm whether a
            // locale-independent conversion is wanted for index terms.
            Set<String> split = Utils.split(value.toLowerCase(), CommonDefs.MV_SCALAR_SEP_CHAR);
            for (String s : split) {
                String[] tokens = tokenize(s);
                for (String token : tokens) {
                    if (token.length() == 0) {
                        continue;
                    }
                    result.add(token);
                }
            }
            return result;
        } catch (Exception e) {
            // Turn into an IllegalArgumentException, preserving the original as the cause.
            throw new IllegalArgumentException("Error parsing field value: " + e.getLocalizedMessage(), e);
        }
    }

    /**
     * Form the row key that should be used to index the given term, belonging to the
     * field with the given name. The analyzer determines if and how to use the field name
     * and term to create the term key. This method may throw if the analyzer thinks that
     * its terms should not be indexed. This default implementation joins the field name
     * and the term, unchanged, with a "/".
     *
     * @param fieldName Name of field in which term was found.
     * @param term      Value of term found.
     * @return          Row key that should be used in the Term record that represents the
     *                  term. The row key will be case-adjusted if applicable.
     */
    public String formTermKey(String fieldName, String term) {
        return fieldName + "/" + term;
    }

    /**
     * Build a term key from the given field name and term by joining the field name and
     * the down-cased term with a "/". Unlike {@link #formTermKey(String, String)}, this
     * is static and always down-cases the term.
     *
     * @param field Name of the field.
     * @param term  Term value; it is converted to lower case using the default locale.
     * @return      Key of the form {@code field/term}.
     */
    public static String makeTermKey(String field, String term) {
        return field + "/" + term.toLowerCase();
    }

    /**
     * Return the constant key "_".
     * NOTE(review): presumably the row key of a record covering all values; confirm
     * against callers.
     *
     * @return  The string "_".
     */
    public static String makeAllKey() {
        return "_";
    }

}   // abstract class FieldAnalyzer