org.bigml.binding.ModelFields Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bigml-binding Show documentation
Show all versions of bigml-binding Show documentation
An open source Java client that gives you a simple binding to interact with BigML. You can use it to
easily create, retrieve, list, update, and delete BigML resources.
package org.bigml.binding;
import org.apache.commons.text.StringEscapeUtils;
import org.bigml.binding.utils.Utils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A ModelFields resource.
*
* This module defines the ModelFields class, which holds the information
* associated with the fields of a model resource in BigML.
* It is the starting point for the Model class, which is used for
* local predictions.
*
*/
public class ModelFields implements Serializable {
private static final long serialVersionUID = 1L;

// Logging. Declared static so the logger is shared by every instance and is
// not written out as part of the serialized object state.
static final Logger LOGGER = LoggerFactory.getLogger(ModelFields.class);

/** Locale assigned by initialize() when the caller passes no data locale. */
private static final String DEFAULT_LOCALE = "en_US.UTF-8";

/** Default missing tokens; an alias of Fields.DEFAULT_MISSING_TOKENS. */
public static String[] DEFAULT_MISSING_TOKENS = Fields.DEFAULT_MISSING_TOKENS;

/**
 * Maps a resource type (the part of the resource id before the first "/")
 * to the key under which that resource keeps its inner structure.
 * Resource types that are not listed here use the key "model".
 */
public static HashMap<String, String> FIELDS_PARENT =
    new HashMap<String, String>();
static {
    FIELDS_PARENT.put("cluster", "clusters");
    FIELDS_PARENT.put("logisticregression", "logistic_regression");
    FIELDS_PARENT.put("ensemble", "ensemble");
    FIELDS_PARENT.put("deepnet", "deepnet");
}

/** Id of the objective field, or null when none was given. */
protected String objectiveFieldId;
/** Name of the objective field as read from the fields structure. */
protected String objectiveFieldName;

/** Field names, filled by uniquifyNames(). */
protected List<String> fieldsName;
/** Field ids, filled by uniquifyNames(). */
protected List<String> fieldsId;
/** Field name to field id. */
protected Map<String, String> fieldsIdByName;
/** Field id to field name. */
protected Map<String, String> fieldsNameById;

/** Values that normalize() turns into null. */
protected List<String> missingTokens;

/** Shallow copy of the fields structure handed to initialize(). */
protected JSONObject fields = null;
/** Result of Utils.invertDictionary(fields, "name"). */
protected JSONObject invertedFields = null;
/** Locale of the data; DEFAULT_LOCALE when none was given. */
protected String dataLocale = null;
// Never assigned in this class; presumably set by subclasses before
// initialize() runs -- TODO confirm.
protected Boolean missingNumerics = null;

/** Per text field id: the field's "summary.term_forms" entry. */
protected JSONObject termForms = new JSONObject();
/** Per text field id: first element of every "summary.tag_cloud" entry. */
protected Map<String, List<String>> tagClouds =
    new HashMap<String, List<String>>();
/** Per text field id: the field's "term_analysis" entry. */
protected JSONObject termAnalysis = new JSONObject();
/** Per items field id: the field's "item_analysis" entry. */
protected JSONObject itemAnalysis = new JSONObject();
/** Per items field id: first element of every "summary.items" entry. */
protected Map<String, List<String>> items =
    new HashMap<String, List<String>>();
/** Per categorical field id: first element of every "summary.categories" entry. */
protected JSONObject categories = new JSONObject();
/** Ids of numeric fields, each mapped to true (see addTerms()). */
protected JSONObject numericFields = new JSONObject();
/**
 * Creates an empty instance without loading any fields structure.
 *
 * Intended for subclasses, which are expected to call one of the
 * initialize(...) methods themselves once their data is available.
 */
protected ModelFields() {
    // deliberately empty: state is filled in later through initialize(...)
}
/**
 * Creates an instance from a fields structure alone.
 *
 * No objective field is set, and the data locale and missing tokens
 * fall back to their defaults.
 *
 * @param fields the fields structure, keyed by field id
 */
public ModelFields(JSONObject fields) {
    this(fields, null, null, null);
}
/**
 * Creates an instance from a fields structure plus the optional settings
 * that accompany it.
 *
 * @param fields the fields structure, keyed by field id
 * @param objectiveFieldId the id of the objective field, or null if there is none
 * @param dataLocale the locale of the data, or null to use the default locale
 * @param missingTokens the tokens to treat as missing values, or null to use
 *        DEFAULT_MISSING_TOKENS
 */
public ModelFields(JSONObject fields, String objectiveFieldId, String dataLocale,
        List<String> missingTokens) {
    initialize(fields, objectiveFieldId, dataLocale, missingTokens);
}
/**
 * Loads the fields structure without collecting any term, category or
 * numeric information.
 *
 * Equivalent to calling the seven-argument initialize with the three
 * trailing flags set to false.
 *
 * @param fields the fields structure itself
 * @param objectiveFieldId the ID of the objective field, or null
 * @param dataLocale the locale of the data, or null for the default locale
 * @param missingTokens the list of missing tokens to use, or null to use
 *        DEFAULT_MISSING_TOKENS
 */
protected void initialize(
        JSONObject fields, String objectiveFieldId, String dataLocale,
        List<String> missingTokens) {
    initialize(fields, objectiveFieldId, dataLocale, missingTokens,
            false, false, false);
}
/**
 * Loads the fields structure and derives the lookup tables built from it.
 *
 * The top level of {@code fields} is copied into a new JSONObject; the
 * per-field entries themselves are shared with the caller's structure.
 *
 * @param fields the fields structure itself
 * @param objectiveFieldId the ID of the objective field, or null
 * @param dataLocale the locale of the data, or null for the default locale
 * @param missingTokens the list of missing tokens to use, or null to use
 *        DEFAULT_MISSING_TOKENS
 * @param terms whether term information should be collected; null counts as false
 * @param categories whether the categories of categorical fields should be
 *        collected; null counts as false
 * @param numerics whether numeric fields should be registered; null counts as false
 */
protected void initialize(JSONObject fields, String objectiveFieldId,
        String dataLocale, List<String> missingTokens, Boolean terms,
        Boolean categories, Boolean numerics) {
    // The flags are boxed, so a null would otherwise fail on unboxing.
    // Read null as "false", matching how other Boolean parameters in this
    // class are given a default when null.
    boolean collectTerms = Boolean.TRUE.equals(terms);
    boolean collectCategories = Boolean.TRUE.equals(categories);
    boolean collectNumerics = Boolean.TRUE.equals(numerics);

    this.fields = new JSONObject();
    this.fields.putAll(fields);

    this.objectiveFieldId = objectiveFieldId;
    if (this.objectiveFieldId != null) {
        // Read before uniquifyNames(), which may rename repeated fields.
        this.objectiveFieldName = Utils.getJSONObject(
                fields, objectiveFieldId + ".name").toString();
    }

    uniquifyNames(this.fields);
    this.invertedFields = Utils.invertDictionary(fields, "name");

    this.missingTokens = missingTokens;
    if (this.missingTokens == null) {
        this.missingTokens = new ArrayList<String>(
                Arrays.asList(DEFAULT_MISSING_TOKENS));
    }

    this.dataLocale = dataLocale;
    if (this.dataLocale == null) {
        this.dataLocale = DEFAULT_LOCALE;
    }

    if (collectCategories) {
        this.categories = new JSONObject();
    }
    if (collectTerms || collectCategories || collectNumerics) {
        addTerms(collectCategories, collectNumerics);
    }
}
/**
 * Walks every field and stores the per-field information used later when
 * parsing input data: term forms, tag clouds and term analysis for text
 * fields; items and item analysis for items fields; optionally the
 * category list of categorical fields and the ids of numeric fields.
 *
 * @param categories whether categorical fields should be collected
 * @param numerics whether numeric fields should be registered (only done
 *        when missingNumerics is not null)
 */
private void addTerms(boolean categories, boolean numerics) {
    for (Object fieldId : fields.keySet()) {
        JSONObject field = (JSONObject) fields.get(fieldId);
        Object optype = field.get("optype");

        if ("text".equals(optype)) {
            termForms.put(fieldId,
                    Utils.getJSONObject(field, "summary.term_forms", new JSONObject()));
            JSONArray tagCloud = (JSONArray) Utils.getJSONObject(
                    field, "summary.tag_cloud", new JSONArray());
            tagClouds.put(fieldId.toString(), leadingEntries(tagCloud));
            termAnalysis.put(fieldId,
                    Utils.getJSONObject(field, "term_analysis", new JSONObject()));
        }

        if ("items".equals(optype)) {
            JSONArray summaryItems = (JSONArray) Utils.getJSONObject(
                    field, "summary.items", new JSONArray());
            items.put(fieldId.toString(), leadingEntries(summaryItems));
            itemAnalysis.put(fieldId,
                    Utils.getJSONObject(field, "item_analysis", new JSONObject()));
        }

        if (categories && "categorical".equals(optype)) {
            JSONArray summaryCategories = (JSONArray) Utils.getJSONObject(
                    field, "summary.categories", new JSONArray());
            JSONArray categoryValues = new JSONArray();
            for (Object entry : summaryCategories) {
                // kept as-is (not converted to String), unlike tags and items
                categoryValues.add(((JSONArray) entry).get(0));
            }
            this.categories.put(fieldId, categoryValues);
        }

        if (numerics && this.missingNumerics != null && "numeric".equals(optype)) {
            this.numericFields.put(fieldId, true);
        }
    }
}

/**
 * Returns the first element of every entry of the given array, as strings.
 * Every entry is expected to be a JSONArray itself.
 */
private List<String> leadingEntries(JSONArray rows) {
    List<String> leading = new ArrayList<String>();
    for (Object row : rows) {
        leading.add(((JSONArray) row).get(0).toString());
    }
    return leading;
}
/**
 * Checks that the given resource has the keys expected of a model,
 * using "model" as the key of its inner structure.
 *
 * @param model the resource to inspect
 * @return true when the structure passes the check
 */
protected boolean checkModelStructure(JSONObject model) {
    final String defaultInnerKey = "model";
    return checkModelStructure(model, defaultInnerKey);
}
/**
 * Checks that the given resource has the keys it needs: a non-null
 * "resource" entry, and the inner structure either nested under "object"
 * or present at the top level.
 *
 * @param model the resource to inspect
 * @param innerKey the key that holds the inner structure
 * @return true when the structure passes the check
 */
protected boolean checkModelStructure(JSONObject model, String innerKey) {
    if (!model.containsKey("resource") || model.get("resource") == null) {
        return false;
    }
    boolean nestedUnderObject = model.containsKey("object")
            && Utils.getJSONObject(model, "object." + innerKey, null) != null;
    return nestedUnderObject || model.containsKey(innerKey);
}
/**
 * Checks the model structure to see whether it contains the required
 * fields information.
 *
 * When the inner structure has a "model_fields" entry, every key in it
 * must be present in the resource's fields. Otherwise the "fields_meta"
 * entry decides: the check passes when its "count" equals its "total".
 *
 * @param model the resource to inspect
 * @return true when the needed fields information is present
 */
protected boolean checkModelFields(JSONObject model) {
    if (!model.containsKey("resource") || model.get("resource") == null) {
        return false;
    }

    String resource = (String) model.get("resource");
    String resourceType = resource.split("/")[0];
    String innerKey = "model";
    if (FIELDS_PARENT.containsKey(resourceType)) {
        innerKey = (String) FIELDS_PARENT.get(resourceType);
    }

    if (!checkModelStructure(model, innerKey)) {
        return false;
    }

    JSONObject root = (JSONObject) Utils.getJSONObject(model, "object", model);
    JSONObject inner = (JSONObject) Utils.getJSONObject(
            root, innerKey, new JSONObject());
    JSONObject fields = (JSONObject) Utils.getJSONObject(
            root, "fields", inner.get("fields"));

    // models only need model_fields to work. The rest of
    // resources will need all fields to work
    JSONObject modelFields = (JSONObject) inner.get("model_fields");
    if (modelFields == null) {
        JSONObject fieldsMeta = (JSONObject) Utils.getJSONObject(
                root, "fields_meta", inner.get("fields_meta"));
        if (fieldsMeta == null) {
            // stored old models will not have the fields_meta info,
            // so we return true to avoid failing in this case
            return true;
        }
        Object count = fieldsMeta.get("count");
        Object total = fieldsMeta.get("total");
        if (count instanceof Number && total instanceof Number) {
            // Compare by value: the two entries are boxed objects, so a
            // reference comparison can report equal numbers as different.
            return ((Number) count).longValue() == ((Number) total).longValue();
        }
        return Objects.equals(count, total);
    }

    if (fields == null) {
        return false;
    }
    return fields.keySet().containsAll(modelFields.keySet());
}
/**
 * Filters the keys of the input data against the model fields and
 * returns only the accepted entries.
 *
 * @param inputData the input data to filter
 * @return the "newInputData" part of the two-argument filterInputData result
 */
protected JSONObject filterInputData(JSONObject inputData) {
    return (JSONObject) filterInputData(inputData, Boolean.FALSE)
            .get("newInputData");
}
/**
 * Filters the keys given in input_data checking against model fields.
 *
 * Entries whose value is one of the missing tokens are removed from
 * {@code inputData} itself. Each remaining key is translated from field
 * name to field id when it is a known name, and kept only if it is a known
 * field id other than the objective field.
 *
 * The result always carries both "newInputData" (the accepted entries,
 * keyed by field id) and "unusedFields" (the rejected keys).
 *
 * @param inputData the input data to filter; modified in place
 * @param addUnusedFields accepted for interface compatibility; it does not
 *        change the result, and null is allowed
 * @return a JSONObject with the "newInputData" and "unusedFields" entries
 */
protected JSONObject filterInputData(JSONObject inputData,
        Boolean addUnusedFields) {
    // remove all missing values
    Iterator<?> keys = inputData.keySet().iterator();
    while (keys.hasNext()) {
        Object key = keys.next();
        if (normalize(inputData.get(key)) == null) {
            keys.remove();
        }
    }

    JSONObject newInputData = new JSONObject();
    List<String> unusedFields = new ArrayList<String>();
    for (Object key : inputData.keySet()) {
        Object value = inputData.get(key);

        Object fieldId = key;
        if (fieldsIdByName.containsKey(fieldId)) {
            fieldId = fieldsIdByName.get(fieldId.toString());
        }

        boolean isObjective = objectiveFieldId != null
                && objectiveFieldId.equals(fieldId);
        if (fieldsId.contains(fieldId) && !isObjective) {
            newInputData.put(fieldId, value);
        } else {
            unusedFields.add(String.valueOf(fieldId));
        }
    }

    JSONObject result = new JSONObject();
    result.put("newInputData", newInputData);
    result.put("unusedFields", unusedFields);
    return result;
}
/**
 * Rebuilds the name/id lookup tables from the fields structure and, when
 * two or more fields share a name, hands over to transformRepeatedNames()
 * to make the names unique.
 *
 * @param fields the fields structure, keyed by field id
 */
protected void uniquifyNames(JSONObject fields) {
    int fieldCount = fields.size();
    fieldsName = new ArrayList<String>(fieldCount);
    fieldsId = new ArrayList<String>(fieldCount);
    fieldsIdByName = new HashMap<String, String>();
    fieldsNameById = new HashMap<String, String>();

    for (Object item : fields.entrySet()) {
        Map.Entry<?, ?> entry = (Map.Entry<?, ?>) item;
        String id = entry.getKey().toString();
        fieldsId.add(id);

        JSONObject field = (JSONObject) entry.getValue();
        String name = Utils.getJSONObject(field, "name").toString();
        fieldsName.add(name);
        fieldsIdByName.put(name, id);
        fieldsNameById.put(id, name);
    }

    // fewer distinct names than fields means at least one name is repeated
    int distinctNames = new HashSet<Object>(fieldsName).size();
    if (distinctNames < fieldsName.size()) {
        transformRepeatedNames(fields);
    }
}
/**
 * If a field name is repeated, it will be transformed adding its
 * column number. If that combination is also a field name, the
 * field id will be added.
 *
 * The objective field, when there is one, is registered first so that
 * its name is never the one that gets changed. Renamed fields are
 * updated in place inside {@code fields}, and the name/id lookup tables
 * are rebuilt. Relies on fieldsId being already populated, which
 * uniquifyNames() does before calling this method.
 *
 * @param fields the fields structure, keyed by field id
 */
protected void transformRepeatedNames(JSONObject fields) {
    fieldsName = new ArrayList<String>();
    fieldsIdByName = new HashMap<String, String>();
    fieldsNameById = new HashMap<String, String>();
    // names already handed out; a set keeps the membership test cheap
    Set<String> takenNames = new HashSet<String>();

    if (objectiveFieldId != null) {
        String name = Utils.getJSONObject(
                fields, objectiveFieldId + ".name").toString();
        takenNames.add(name);
        fieldsName.add(name);
        fieldsIdByName.put(name, objectiveFieldId);
        fieldsNameById.put(objectiveFieldId, name);
    }

    for (Object id : fieldsId) {
        String fieldId = id.toString();
        if (fieldId.equals(objectiveFieldId)) {
            continue;
        }

        String name = Utils.getJSONObject(fields, fieldId + ".name").toString();
        if (takenNames.contains(name)) {
            // the column number is only needed for repeated names
            int columnNumber = ((Number) Utils.getJSONObject(
                    fields, fieldId + ".column_number")).intValue();
            name = String.format("%s%d", name, columnNumber);
            if (takenNames.contains(name)) {
                // field ids are strings, so %s rather than %d
                name = String.format("%s_%s", name, fieldId);
            }
            ((JSONObject) fields.get(fieldId)).put("name", name);
        }

        takenNames.add(name);
        fieldsName.add(name);
        fieldsIdByName.put(name, fieldId);
        fieldsNameById.put(fieldId, name);
    }
}
/**
 * Cleans missing tokens: a value that appears in the list of missing
 * tokens is replaced by null, any other value is returned unchanged.
 *
 * @param <T> the type of the value
 * @param value the value to normalize
 * @return null when the value is a missing token, the value itself otherwise
 */
protected <T> T normalize(T value) {
    return missingTokens.contains(value) ? null : value;
}
// /**
// * Strips prefixes and suffixes if present
// */
// public Object stripAffixes(String value, JSONObject field) {
//
// if( field.containsKey("prefix") &&
// value.startsWith(field.get("prefix").toString()) ) {
// value = value.substring(field.get("prefix").toString().length(),
// value.length());
// }
//
// if( field.containsKey("suffix") &&
// value.endsWith(field.get("suffix").toString()) ) {
// value = value.substring(0,
// value.length() - field.get("suffix").toString().length());
// }
//
// return value;
// }
/**
 * Parses the input data to find the list of unique terms in the
 * tag cloud.
 *
 * Handles, in this order, the text fields, the items fields and the
 * categorical fields that were collected by addTerms(). Every field that
 * is processed is removed from {@code inputData}, so on return the map
 * only holds the entries this method did not handle.
 *
 * For text and items fields the stored value is a map from term to number
 * of occurrences when the input is a String (null counts as the empty
 * string), or the input value itself otherwise. For categorical fields it
 * is a single-entry map from the input value to 1.
 *
 * @param inputData the input data, keyed by field id; modified in place
 * @return the extracted information, keyed by field id
 */
protected Map<String, Object> uniqueTerms(Map<String, Object> inputData) {
    Map<String, Object> result = new HashMap<String, Object>();

    for (Object fieldId : termForms.keySet()) {
        String id = fieldId.toString();
        if (!inputData.containsKey(id)) {
            continue;
        }
        Object raw = inputData.get(id);
        Object value = (raw != null ? raw : "");
        if (value instanceof String) {
            String text = (String) value;
            boolean caseSensitive = (Boolean) Utils.getJSONObject(termAnalysis,
                    id + ".case_sensitive", Boolean.TRUE);
            String tokenMode = (String) Utils.getJSONObject(termAnalysis,
                    id + ".token_mode", "all");

            List<String> terms = new ArrayList<String>();
            if (!Utils.TM_FULL_TERM.equals(tokenMode)) {
                terms.addAll(parseTerms(text, caseSensitive));
            }
            if (!Utils.TM_TOKENS.equals(tokenMode)) {
                // the whole text also counts as a term in this mode
                terms.add(caseSensitive ? text : text.toLowerCase());
            }
            result.put(id, uniqueTerms(terms,
                    (JSONObject) termForms.get(fieldId), tagClouds.get(id)));
        } else {
            result.put(id, value);
        }
        inputData.remove(id);
    }

    // the same for items fields
    for (Object fieldId : itemAnalysis.keySet()) {
        String id = fieldId.toString();
        if (!inputData.containsKey(id)) {
            continue;
        }
        Object raw = inputData.get(id);
        Object value = (raw != null ? raw : "");
        if (value instanceof String) {
            List<String> terms = parseItems((String) value, itemSeparatorRegexp(id));
            result.put(id, uniqueTerms(terms, new JSONObject(), items.get(id)));
        } else {
            result.put(id, value);
        }
        inputData.remove(id);
    }

    for (Object fieldId : categories.keySet()) {
        String id = fieldId.toString();
        if (!inputData.containsKey(id)) {
            continue;
        }
        Object raw = inputData.get(id);
        JSONObject data = new JSONObject();
        data.put(raw != null ? raw : "", 1);
        result.put(id, data);
        inputData.remove(id);
    }

    return result;
}

/**
 * Returns the regular expression used to split the items of a field:
 * the configured "separator_regexp" when there is one, otherwise the
 * configured "separator" (a single space by default) matched literally.
 */
private String itemSeparatorRegexp(String fieldId) {
    String regexp = (String) Utils.getJSONObject(
            itemAnalysis, fieldId + ".separator_regexp", null);
    if (regexp == null || regexp.isEmpty()) {
        // An empty expression would split the text into single characters,
        // so it is treated the same as "not configured".
        String separator = (String) Utils.getJSONObject(
                itemAnalysis, fieldId + ".separator", " ");
        // Pattern.quote makes separators such as "|" or "." match literally.
        return Pattern.quote(separator != null ? separator : " ");
    }
    if ("$".equals(regexp)) {
        return "\\$";
    }
    return regexp;
}
/**
 * Extracts the unique terms that occur in one of the alternative forms in
 * term_forms or in the tag cloud.
 *
 * A term found in the tag cloud is counted under its own name. A term
 * that is not in the tag cloud but is a known form of some term is
 * counted under that term. Any other term is ignored.
 *
 * @param terms the terms found in the input
 * @param termForms maps each term to the array of its alternative forms
 * @param tagClouds the terms of the tag cloud; null is read as empty
 * @return the number of occurrences of every recognised term
 */
protected Map<String, Integer> uniqueTerms(List<String> terms,
        JSONObject termForms, List<String> tagClouds) {
    // every known form (and every term itself) -> the term it stands for
    Map<String, String> termByForm = new HashMap<String, String>();
    for (Object term : termForms.keySet()) {
        String canonical = term.toString();
        JSONArray forms = (JSONArray) termForms.get(term);
        for (Object form : forms) {
            termByForm.put(form.toString(), canonical);
        }
        termByForm.put(canonical, canonical);
    }

    // a set makes the per-term membership test independent of cloud size
    Set<Object> cloud = new HashSet<Object>();
    if (tagClouds != null) {
        cloud.addAll(tagClouds);
    }

    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (Object term : terms) {
        String key = term.toString();
        if (!cloud.contains(key)) {
            key = termByForm.get(key);
            if (key == null) {
                continue;
            }
        }
        Integer seen = counts.get(key);
        counts.put(key, seen == null ? 1 : seen + 1);
    }
    return counts;
}
/**
 * Matches one token together with the delimiters around it. The token is
 * capture group 2; groups 1 and 3 are a word boundary or an underscore.
 */
private static final Pattern TERM_PATTERN =
        Pattern.compile("(\\b|_)([^\b_\\s]+?)(\\b|_)");

/**
 * Returns the list of parsed terms.
 *
 * Only the token itself is returned; an underscore that delimits it is
 * not part of the term.
 *
 * @param text the text to split into terms; null yields an empty list
 * @param caseSensitive whether to keep the case of the terms; null is
 *        read as true, false lower-cases every term
 * @return the terms in the order they appear in the text
 */
protected List<String> parseTerms(String text, Boolean caseSensitive) {
    List<String> terms = new ArrayList<String>();
    if (text == null) {
        return terms;
    }
    boolean keepCase = (caseSensitive == null || caseSensitive);

    Matcher matcher = TERM_PATTERN.matcher(text);
    // check all occurrences
    while (matcher.find()) {
        String term = matcher.group(2);
        terms.add(keepCase ? term : term.toLowerCase());
    }
    return terms;
}
/**
 * Returns the list of parsed items.
 *
 * @param text the text to split; null yields an empty list
 * @param regexp the regular expression that separates the items
 * @return the items, in the order they appear in the text
 */
protected List<String> parseItems(String text, String regexp) {
    if (text == null) {
        // an empty list rather than null, so the result can always be iterated
        return Collections.emptyList();
    }
    return Arrays.asList(text.split(regexp));
}
/**
 * Returns the tokens that are treated as missing values.
 *
 * @return the list held by this instance (not a copy)
 */
public List<String> getMissingTokens() {
    return missingTokens;
}
/**
 * Returns the fields structure held by this instance.
 *
 * @return the fields, keyed by field id (not a copy)
 */
public JSONObject getFields() {
    return this.fields;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy