All downloads are free. Search and download functionality uses the official Maven repository.

org.bigml.binding.ModelFields Maven / Gradle / Ivy

Go to download

An open source Java client that gives you a simple binding to interact with BigML. You can use it to easily create, retrieve, list, update, and delete BigML resources.

There is a newer version: 2.1.1
Show newest version
package org.bigml.binding;

import org.apache.commons.text.StringEscapeUtils;
import org.bigml.binding.utils.Utils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A ModelFields resource.
 * 
 * This module defines a ModelFields class to hold the information 
 * associated to the fields of the model resource in BigML.
 * It becomes the starting point for the Model class, that is used for 
 * local predictions.
 * 
 */
public class ModelFields implements Serializable {

    private static final long serialVersionUID = 1L;

    // Logging. Kept static so that the logger is shared by every instance
    // and never becomes part of the serialized state of this Serializable
    // class.
    static final Logger LOGGER = LoggerFactory.getLogger(ModelFields.class);

    // Locale used when initialize() receives no data locale
    private static final String DEFAULT_LOCALE = "en_US.UTF-8";

    // Tokens treated as missing values when the caller supplies none
    public static String[] DEFAULT_MISSING_TOKENS = Fields.DEFAULT_MISSING_TOKENS;

    // Maps a resource type (the part of the resource id before the "/") to
    // the key of the inner object that is checked for its fields information
    public static HashMap<String, String> FIELDS_PARENT =
            new HashMap<String, String>();
    static {
        FIELDS_PARENT.put("cluster", "clusters");
        FIELDS_PARENT.put("logisticregression", "logistic_regression");
        FIELDS_PARENT.put("ensemble", "ensemble");
        FIELDS_PARENT.put("deepnet", "deepnet");
    }


    // Objective field and the name/id lookups built by uniquifyNames()
    protected String objectiveFieldId;
    protected String objectiveFieldName;
    protected List<String> fieldsName;
    protected List<String> fieldsId;
    protected Map<String, String> fieldsIdByName;
    protected Map<String, String> fieldsNameById;

    protected List<String> missingTokens;
    protected JSONObject fields = null;
    // Result of Utils.invertDictionary(fields, "name")
    protected JSONObject invertedFields = null;
    protected String dataLocale = null;

    // Not assigned anywhere in this class; presumably set by subclasses
    // before initialize() is called -- TODO confirm
    protected Boolean missingNumerics = null;
    // Text fields: "summary.term_forms" per field id
    protected JSONObject termForms = new JSONObject();
    // Text fields: first element of every "summary.tag_cloud" entry, per field id
    protected Map<String, List<String>> tagClouds =
            new HashMap<String, List<String>>();
    // Text fields: "term_analysis" per field id
    protected JSONObject termAnalysis = new JSONObject();
    // Items fields: "item_analysis" per field id
    protected JSONObject itemAnalysis = new JSONObject();
    // Items fields: first element of every "summary.items" entry, per field id
    protected Map<String, List<String>> items =
            new HashMap<String, List<String>>();
    // Categorical fields: first element of every "summary.categories" entry
    protected JSONObject categories = new JSONObject();
    // Numeric fields flagged by addTerms(), per field id
    protected JSONObject numericFields = new JSONObject();
    

    /**
     * Creates an instance without any fields information.
     *
     * Nothing is set up here: classes using this constructor need to invoke
     * one of the initialize methods themselves.
     */
    protected ModelFields() {
    }

    /**
     * Creates an instance from a fields structure.
     *
     * Delegates to initialize with no objective field, no data locale and
     * no missing tokens, so the defaults chosen by initialize apply.
     *
     * @param fields the fields structure, keyed by field ID
     */
    public ModelFields(JSONObject fields) {
        initialize(fields, null, null, null);
    }

    /**
     * Creates an instance from a fields structure and the given settings.
     *
     * All the arguments are passed on unchanged to initialize.
     *
     * @param fields the fields structure, keyed by field ID
     * @param objectiveFieldId the ID of the objective field, or null
     * @param dataLocale the locale of the data, or null to use the default
     * @param missingTokens the list of missing tokens to use, or null to use
     *        DEFAULT_MISSING_TOKENS
     */
    public ModelFields(JSONObject fields, String objectiveFieldId, String dataLocale,
                       List missingTokens) {
        initialize(fields, objectiveFieldId, dataLocale, missingTokens);
    }

    /**
     * Initializes the instance with the fields structure, without collecting
     * the terms, categories or numeric fields information.
     *
     * Delegates to the seven-argument initialize with the three flags set
     * to false.
     *
     * @param fields the fields structure itself
     * @param objectiveFieldId the ID of the objective field
     * @param missingTokens the list of missing tokens to use. DEFAULT_MISSING_TOKENS will be used by default
     * @param dataLocale the locale of the data
     */
    protected void initialize(
    		JSONObject fields, String objectiveFieldId, String dataLocale,
            List missingTokens) {
    	
    	initialize(fields, objectiveFieldId, dataLocale, missingTokens,
    			   false, false, false);
    }
    
    /**
     * Initializes the instance with the fields structure and, on request,
     * collects the per-field terms, categories and numeric information.
     *
     * The entries of the given structure are copied into a new JSONObject
     * (the per-field objects themselves are shared, not cloned), the field
     * names are made unique and the name/id lookups are built.
     *
     * @param fields the fields structure itself, keyed by field ID
     * @param objectiveFieldId the ID of the objective field, or null
     * @param dataLocale the locale of the data. The default locale is used
     *        when null
     * @param missingTokens the list of missing tokens to use.
     *        DEFAULT_MISSING_TOKENS will be used when null
     * @param terms whether the text and items information must be collected.
     *        Null is handled as false
     * @param categories whether the categories of the categorical fields
     *        must be collected. Null is handled as false
     * @param numerics whether the numeric fields must be flagged. Null is
     *        handled as false
     */
    protected void initialize(JSONObject fields, String objectiveFieldId, 
            String dataLocale, List missingTokens, Boolean terms, 
            Boolean categories, Boolean numerics) {

        // The flags are boxed: a null one is read as false instead of
        // failing with a NullPointerException when unboxed
        boolean collectTerms = Boolean.TRUE.equals(terms);
        boolean collectCategories = Boolean.TRUE.equals(categories);
        boolean collectNumerics = Boolean.TRUE.equals(numerics);

        this.fields = new JSONObject();
        this.fields.putAll(fields);

        this.objectiveFieldId = objectiveFieldId;
        if (this.objectiveFieldId != null) {
            // Guards the case in which the lookup yields null (e.g. the
            // objective ID is not one of the fields)
            Object objectiveName = Utils.getJSONObject(
                    fields, objectiveFieldId + ".name");
            this.objectiveFieldName =
                    (objectiveName != null ? objectiveName.toString() : null);
        }

        uniquifyNames(this.fields);
        this.invertedFields = Utils.invertDictionary(fields, "name");

        this.missingTokens = missingTokens;
        if (this.missingTokens == null) {
            this.missingTokens = new ArrayList<String>(
                    Arrays.asList(DEFAULT_MISSING_TOKENS));
        }

        this.dataLocale = dataLocale;
        if (this.dataLocale == null) {
            this.dataLocale = DEFAULT_LOCALE;
        }

        if (collectCategories) {
            // start from a clean categories structure
            this.categories = new JSONObject();
        }

        if (collectTerms || collectCategories || collectNumerics) {
            addTerms(collectCategories, collectNumerics);
        }

    }
    
    
    /**
     * Walks every field and stores the summary information needed later,
     * depending on the field's optype:
     *
     * - text: term forms, tag cloud terms and term analysis
     * - items: items and item analysis
     * - categorical (only if requested): list of categories
     * - numeric (only if requested and missingNumerics is set): a flag
     *
     * @param categories whether the categories must be collected
     * @param numerics whether the numeric fields must be flagged
     */
    private void addTerms(boolean categories, boolean numerics) {
        for (Object key : fields.keySet()) {
            JSONObject fieldInfo = (JSONObject) fields.get(key);
            Object optype = fieldInfo.get("optype");
            String id = key.toString();

            // The optype can match only one of the branches below
            if ("text".equals(optype)) {
                termForms.put(key, Utils.getJSONObject(
                        fieldInfo, "summary.term_forms", new JSONObject()));

                JSONArray cloud = (JSONArray) Utils.getJSONObject(
                        fieldInfo, "summary.tag_cloud", new JSONArray());
                tagClouds.put(id, firstElementsAsStrings(cloud));

                termAnalysis.put(key, Utils.getJSONObject(
                        fieldInfo, "term_analysis", new JSONObject()));

            } else if ("items".equals(optype)) {
                JSONArray summaryItems = (JSONArray) Utils.getJSONObject(
                        fieldInfo, "summary.items", new JSONArray());
                items.put(id, firstElementsAsStrings(summaryItems));

                itemAnalysis.put(key, Utils.getJSONObject(
                        fieldInfo, "item_analysis", new JSONObject()));

            } else if (categories && "categorical".equals(optype)) {
                JSONArray summaryCategories = (JSONArray) Utils.getJSONObject(
                        fieldInfo, "summary.categories", new JSONArray());

                // Only the category itself (first element) is kept
                JSONArray names = new JSONArray();
                for (Object entry : summaryCategories) {
                    names.add(((JSONArray) entry).get(0));
                }
                this.categories.put(key, names);

            } else if (numerics && this.missingNumerics != null
                    && "numeric".equals(optype)) {
                // NOTE(review): the check only requires missingNumerics to be
                // non-null, so Boolean.FALSE also flags the field -- confirm
                // that this is intended
                this.numericFields.put(key, true);
            }
        }
    }

    /**
     * Returns the string form of the first element of every entry, where
     * each entry is itself a JSONArray.
     */
    private static List<String> firstElementsAsStrings(JSONArray entries) {
        List<String> firsts = new ArrayList<String>();
        for (Object entry : entries) {
            firsts.add(((JSONArray) entry).get(0).toString());
        }
        return firsts;
    }
    

    /**
     * Checks the model structure to see if it contains all the needed keys,
     * using "model" as the key of the inner object.
     *
     * @param model the resource structure to check
     * @return the result of checkModelStructure(model, "model")
     */
    protected boolean checkModelStructure(JSONObject model) {
    	return checkModelStructure(model, "model");
    }
    
    
    /**
     * Checks the model structure to see if it contains all the needed keys.
     *
     * The structure is valid when it has a non-null "resource" and either
     * an "object" that holds a non-null inner key or the inner key at the
     * top level.
     *
     * @param model the resource structure to check
     * @param innerKey the key of the inner object to look for
     * @return true if the needed keys are found
     */
    protected boolean checkModelStructure(JSONObject model, String innerKey) {
        // A resource ID is always required
        if (!model.containsKey("resource") || model.get("resource") == null) {
            return false;
        }

        // Full resource: the inner object is nested in "object"
        if (model.containsKey("object")
                && Utils.getJSONObject(model, "object." + innerKey, null) != null) {
            return true;
        }

        // Otherwise the inner key must be at the top level
        return model.containsKey(innerKey);
    }
    
    /**
     * Checks the model structure to see whether it contains the required
     * fields information.
     *
     * The inner key is chosen from FIELDS_PARENT using the resource type
     * (the part of the resource ID before the first "/"), and is "model"
     * when the type is not listed there.
     *
     * If the inner object has no "model_fields", the fields are considered
     * complete when the "count" and "total" of "fields_meta" are equal, or
     * when there is no "fields_meta" at all (old stored models). Otherwise,
     * every key of "model_fields" must be found in "fields".
     *
     * @param model the resource structure to check
     * @return true if the fields information is complete
     */
    protected boolean checkModelFields(JSONObject model) {
        Object resourceId = model.get("resource");
        if (resourceId == null) {
            return false;
        }

        String resourceType = ((String) resourceId).split("/")[0];
        String innerKey = "model";
        if (FIELDS_PARENT.containsKey(resourceType)) {
            innerKey = (String) FIELDS_PARENT.get(resourceType);
        }

        if (!checkModelStructure(model, innerKey)) {
            return false;
        }

        model = (JSONObject) Utils.getJSONObject(model, "object", model);

        JSONObject modelObj = (JSONObject) Utils.getJSONObject(
                model, innerKey, new JSONObject());

        JSONObject fields = (JSONObject) Utils.getJSONObject(
                model, "fields", modelObj.get("fields"));

        // models only need model_fields to work. The rest of
        // resources will need all fields to work
        JSONObject modelFields = (JSONObject) modelObj.get("model_fields");
        if (modelFields == null) {
            JSONObject fieldsMeta = (JSONObject) Utils.getJSONObject(
                    model, "fields_meta", modelObj.get("fields_meta"));
            if (fieldsMeta == null) {
                // stored old models will not have the fields_meta info,
                // so we return true to avoid failing in this case
                return true;
            }

            // The values are boxed numbers: they must be compared by value,
            // because == on boxed objects compares references
            Object count = fieldsMeta.get("count");
            Object total = fieldsMeta.get("total");
            if (count instanceof Number && total instanceof Number) {
                return ((Number) count).longValue()
                        == ((Number) total).longValue();
            }
            return Objects.equals(count, total);
        }

        if (fields == null) {
            return false;
        }

        for (Object key : modelFields.keySet()) {
            if (!fields.containsKey(key)) {
                return false;
            }
        }

        return true;
    }
    
    /**
     * Filters the keys given in input_data checking against model fields.
     *
     * Calls the two-argument filterInputData with addUnusedFields set to
     * false and returns only the filtered input data.
     *
     * @param inputData the input data to filter
     * @return the "newInputData" entry of the two-argument method's result
     */
    protected JSONObject filterInputData(JSONObject inputData) {
    	JSONObject filteredInputData = filterInputData(inputData, false);
    	// The unused fields information of the result is discarded here
    	return (JSONObject) filteredInputData.get("newInputData");
    }
    
    
    /**
     * Filters the keys given in input_data checking against model fields.
     *
     * Entries whose value is null or one of the missing tokens are removed
     * from the given inputData itself. The remaining keys, that can be
     * field names or field IDs, are translated to field IDs. The ones that
     * are not a field of the model, or that are the objective field, are
     * reported as unused.
     *
     * @param inputData the input data to filter. It is modified: the
     *        entries with missing values are removed from it
     * @param addUnusedFields currently not consulted: the unused fields
     *        are always part of the result
     * @return a JSONObject with two entries: "newInputData", the filtered
     *         data keyed by field ID, and "unusedFields", the list of keys
     *         that were not used
     */
    protected JSONObject filterInputData(JSONObject inputData, 
                                         Boolean addUnusedFields) {

        // remove all missing values
        Iterator<?> entries = inputData.entrySet().iterator();
        while (entries.hasNext()) {
            Map.Entry<?, ?> entry = (Map.Entry<?, ?>) entries.next();
            Object value = entry.getValue();
            if (normalize(value) == null) {
                entries.remove();
            }
        }

        JSONObject newInputData = new JSONObject();
        List<String> unusedFields = new ArrayList<String>();
        for (Object entryObject : inputData.entrySet()) {
            Map.Entry<?, ?> entry = (Map.Entry<?, ?>) entryObject;
            Object fieldId = entry.getKey();

            // keys can be field names: translate them to the field ID
            if (fieldsIdByName.containsKey(fieldId)) {
                fieldId = fieldsIdByName.get(fieldId.toString());
            }

            boolean isModelField = fieldsId.contains(fieldId);
            boolean isObjective = objectiveFieldId != null
                    && objectiveFieldId.equals(fieldId);

            if (isModelField && !isObjective) {
                newInputData.put(fieldId, entry.getValue());
            } else {
                unusedFields.add(String.valueOf(fieldId));
            }
        }

        JSONObject result = new JSONObject();
        result.put("newInputData", newInputData);
        result.put("unusedFields", unusedFields);

        return result;
    }

    /**
     * Builds the lists of field IDs and names and the lookups between
     * them. If some name is found more than once, transformRepeatedNames
     * is called to make the names unique.
     *
     * @param fields the fields structure, keyed by field ID
     */
    protected void uniquifyNames(JSONObject fields) {

        List<String> names = new ArrayList<String>(fields.size());
        List<String> ids = new ArrayList<String>(fields.size());
        Map<String, String> idByName = new HashMap<String, String>();
        Map<String, String> nameById = new HashMap<String, String>();

        // The instance attributes point to the new collections before they
        // are filled in
        this.fieldsName = names;
        this.fieldsId = ids;
        this.fieldsIdByName = idByName;
        this.fieldsNameById = nameById;

        for (Object key : fields.keySet()) {
            String id = key.toString();
            ids.add(id);

            JSONObject fieldInfo = (JSONObject) fields.get(key);
            String name = Utils.getJSONObject(fieldInfo, "name").toString();
            names.add(name);

            idByName.put(name, id);
            nameById.put(id, name);
        }

        // Fewer distinct names than names means that some name is repeated
        int distinctNames = new HashSet<String>(names).size();
        boolean repeated = distinctNames < names.size();
        if (repeated) {
            transformRepeatedNames(fields);
        }
    }

    /**
     * If a field name is repeated, it will be transformed adding its
     * column number. If that combination is also a field name, the
     * field id will be added.
     *
     * The objective field, if any, is handled first so that its name is
     * never changed. The new names are also stored in the "name" attribute
     * of the given fields structure, and the names list and the name/id
     * lookups are rebuilt.
     *
     * @param fields the fields structure, keyed by field ID. The "name" of
     *        the fields whose name is repeated is modified
     */
    protected void transformRepeatedNames(JSONObject fields) {
        List<String> names = new ArrayList<String>();
        Set<String> usedNames = new HashSet<String>();
        Map<String, String> idByName = new HashMap<String, String>();
        Map<String, String> nameById = new HashMap<String, String>();

        fieldsName = names;
        fieldsIdByName = idByName;
        fieldsNameById = nameById;

        // The objective field keeps its name: it is registered first
        if (objectiveFieldId != null) {
            Object objectiveName = Utils.getJSONObject(
                    fields, objectiveFieldId + ".name");
            if (objectiveName != null) {
                String name = objectiveName.toString();
                names.add(name);
                usedNames.add(name);
                idByName.put(name, objectiveFieldId);
                nameById.put(objectiveFieldId, name);
            }
        }

        for (Object idObject : fieldsId) {
            String fieldId = idObject.toString();
            if (fieldId.equals(objectiveFieldId)) {
                continue;
            }

            String name = Utils.getJSONObject(fields, fieldId + ".name").toString();
            if (usedNames.contains(name)) {
                // The column number is only needed for repeated names
                int columnNumber = ((Number) Utils.getJSONObject(
                        fields, fieldId + ".column_number")).intValue();
                name = String.format("%s%d", name, columnNumber);
                if (usedNames.contains(name)) {
                    // The field ID is a string, so %s is the right conversion
                    name = String.format("%s_%s", name, fieldId);
                }

                ((JSONObject) fields.get(fieldId)).put("name", name);
            }

            names.add(name);
            usedNames.add(name);
            idByName.put(name, fieldId);
            nameById.put(fieldId, name);
        }
    }

    /**
     * Cleans missing tokens.
     *
     * @param <T> the type of the value
     * @param value the value to normalize
     * @return null if the value is one of the missing tokens, and the
     *         value itself otherwise (a null value is returned as it is)
     */
    protected <T> T normalize(T value) {
        return (missingTokens.contains(value) ? null : value);
    }
    

//    /**
//     * Strips prefixes and suffixes if present
//     */
//    public Object stripAffixes(String value, JSONObject field) {
//
//        if( field.containsKey("prefix") &&
//                value.startsWith(field.get("prefix").toString()) ) {
//            value =  value.substring(field.get("prefix").toString().length(),
//                    value.length());
//        }
//
//        if( field.containsKey("suffix") &&
//                value.endsWith(field.get("suffix").toString()) ) {
//            value =  value.substring(0,
//                    value.length() - field.get("suffix").toString().length());
//        }
//
//        return value;
//    }
    
    /**
     * Parses the input data to find the list of unique terms in the
     * tag cloud.
     *
     * The text, items and categorical fields (as collected by addTerms)
     * found in the input data are handled and then removed from the given
     * inputData:
     *
     * - text and items fields whose value is a string are mapped to the
     *   counts of their known terms or items
     * - categorical fields are mapped to a JSONObject with the value as
     *   key and 1 as value
     * - text and items fields with any other type of value are mapped to
     *   the value itself
     *
     * A null value is handled as an empty string.
     *
     * @param inputData the input data, keyed by field ID. It is modified:
     *        the handled fields are removed from it
     * @return the information of the handled fields, keyed by field ID
     */
    protected Map uniqueTerms(Map inputData) {
        Map<String, Object> result = new HashMap<String, Object>();

        // text fields
        for (Object fieldId : termForms.keySet()) {
            String id = fieldId.toString();
            if (!inputData.containsKey(id)) {
                continue;
            }

            Object inputDataField = inputData.get(id);
            if (inputDataField == null) {
                inputDataField = "";
            }

            if (inputDataField instanceof String) {
                String text = (String) inputDataField;

                // Case sensitive unless the flag is explicitly false
                Object caseFlag = Utils.getJSONObject(termAnalysis,
                        id + ".case_sensitive", Boolean.TRUE);
                boolean caseSensitive = !Boolean.FALSE.equals(caseFlag);
                String tokenMode = (String) Utils.getJSONObject(termAnalysis,
                        id + ".token_mode", "all");

                List terms = new ArrayList<String>();
                if (!Utils.TM_FULL_TERM.equals(tokenMode)) {
                    terms = parseTerms(text, caseSensitive);
                }

                if (!Utils.TM_TOKENS.equals(tokenMode)) {
                    // the full text is also a candidate term
                    terms.add(caseSensitive ? text : text.toLowerCase());
                }

                result.put(id, uniqueTerms(terms,
                        (JSONObject) termForms.get(fieldId),
                        (List) tagClouds.get(id)));
            } else {
                result.put(id, inputDataField);
            }

            inputData.remove(id);
        }

        // the same for items fields
        for (Object fieldId : itemAnalysis.keySet()) {
            String id = fieldId.toString();
            if (!inputData.containsKey(id)) {
                continue;
            }

            Object inputDataField = inputData.get(id);
            if (inputDataField == null) {
                inputDataField = "";
            }

            if (inputDataField instanceof String) {
                String separator = (String) Utils.getJSONObject(
                        itemAnalysis, id + ".separator", " ");
                String regexp = (String) Utils.getJSONObject(
                        itemAnalysis, id + ".separator_regexp", "");

                if (regexp == null || regexp.isEmpty()) {
                    // No regular expression given: split by the literal
                    // separator. It is quoted so that characters such
                    // as "|", "." or "$" are not read as regex syntax
                    regexp = Pattern.quote(separator == null ? " " : separator);
                } else if ("$".equals(regexp)) {
                    regexp = "\\$";
                }

                List terms = parseItems((String) inputDataField, regexp);

                result.put(id, uniqueTerms(terms,
                        new JSONObject(),
                        (List) items.get(id)));
            } else {
                result.put(id, inputDataField);
            }

            inputData.remove(id);
        }

        // categorical fields
        for (Object fieldId : categories.keySet()) {
            String id = fieldId.toString();
            if (!inputData.containsKey(id)) {
                continue;
            }

            Object inputDataField = inputData.get(id);
            if (inputDataField == null) {
                inputDataField = "";
            }

            JSONObject data = new JSONObject();
            data.put(inputDataField, 1);
            result.put(id, data);
            inputData.remove(id);
        }

        return result;
    }
    
    /**
     * Extracts the unique terms that occur in one of the alternative forms in
     *  term_forms or in the tag cloud.
     *
     * A term found in the tag cloud is counted as itself. Otherwise, if it
     * is a term of term_forms or one of its alternative forms, it is
     * counted as the term of term_forms. Any other term is ignored.
     *
     * @param terms the terms to count. Null is handled as no terms
     * @param termForms the alternative forms (JSONArray) for each term
     * @param tagClouds the terms in the tag cloud. Null is handled as empty
     * @return the number of occurrences for each of the terms found
     */
    protected Map uniqueTerms(List terms, 
            JSONObject termForms, List tagClouds) {

        // Maps every term and every alternative form to the term itself
        Map<String, String> extendForms = new HashMap<String, String>();
        for (Object term : termForms.keySet()) {
            JSONArray forms = (JSONArray) termForms.get(term);
            for (Object form : forms) {
                extendForms.put(form.toString(), term.toString());
            }
            extendForms.put(term.toString(), term.toString());
        }

        Map<String, Integer> termsSet = new HashMap<String, Integer>();
        if (terms == null) {
            return termsSet;
        }

        // A set avoids scanning the whole tag cloud for every term
        Set<Object> cloud = new HashSet<Object>();
        if (tagClouds != null) {
            cloud.addAll(tagClouds);
        }

        for (Object term : terms) {
            String text = term.toString();

            // The tag cloud has priority over the alternative forms
            String counted = (cloud.contains(text) ?
                    text : extendForms.get(text));
            if (counted == null) {
                continue;
            }

            Integer occurrences = termsSet.get(counted);
            termsSet.put(counted, occurrences == null ? 1 : occurrences + 1);
        }

        return termsSet;
    }

    // A term is a run of characters other than backspace, underscore or
    // whitespace (group 2), delimited by word boundaries or underscores.
    // Compiled once: Pattern instances are immutable and thread-safe.
    private static final Pattern TERM_PATTERN =
            Pattern.compile("(\\b|_)([^\b_\\s]+?)(\\b|_)");

    /**
     * Returns the list of parsed terms
     *
     * Only the term itself is returned: an underscore used as delimiter is
     * not part of it.
     *
     * @param text the text to parse. Null is handled as no terms
     * @param caseSensitive whether the case of the terms is kept. If false,
     *        the terms are lowercased. Null is handled as true
     * @return a new modifiable list with the terms in order of appearance
     */
    protected List parseTerms(String text, Boolean caseSensitive) {
        List<String> terms = new ArrayList<String>();
        if (text == null) {
            return terms;
        }

        boolean keepCase = !Boolean.FALSE.equals(caseSensitive);

        Matcher matcher = TERM_PATTERN.matcher(text);
        // check all occurrences
        while (matcher.find()) {
            // group 2 is the term; the whole match can include delimiters
            String term = matcher.group(2);
            terms.add(keepCase ? term : term.toLowerCase());
        }

        return terms;
    }
    
    /**
     * Returns the list of parsed items
     *
     * @param text the text to split
     * @param regexp the regular expression used as items separator
     * @return the fixed-size list of the pieces of the text, or null if
     *         the text is null
     */
    protected List parseItems(String text, String regexp) {
        if (text == null) {
            return null;
        }

        String[] pieces = text.split(regexp);
        return Arrays.asList(pieces);
    }
    
    
    /**
     * Returns the list of tokens that are handled as missing values.
     *
     * The internal list itself is returned, not a copy.
     */
    public List getMissingTokens() {
        return missingTokens;
    }

    /**
     * Returns the fields structure, keyed by field ID.
     *
     * The internal structure itself is returned, not a copy.
     */
    public JSONObject getFields() {
        return fields;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy