All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opensextant.output.GISDataModel Maven / Gradle / Ivy

There is a newer version: 3.7.3
Show newest version
/*
 *
 * Copyright 2012-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.output;

import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FilenameUtils;
import org.opensextant.ConfigException;
import org.opensextant.data.Geocoding;
import org.opensextant.extraction.ExtractionResult;
import org.opensextant.extraction.TextMatch;
import org.opensextant.giscore.events.Feature;
import org.opensextant.giscore.events.Schema;
import org.opensextant.giscore.events.SimpleField;
import org.opensextant.giscore.geometry.Point;
import org.opensextant.processing.ResultsUtility;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class GISDataModel {

    protected final Logger log = LoggerFactory.getLogger(getClass());
    protected boolean includeOffsets = false;
    protected boolean includeCoordinate = true;
    protected boolean useFileHyperlink = false;

    protected Schema schema = null;
    protected List field_order = new ArrayList();
    public Set field_set = new HashSet();

    /**
     * Instantiates a new GIS data model.
     *
     * @param jobName the job name
     * @param includeOffsets the include offsets
     * @param includeCoordinate the include coordinate
     */
    public GISDataModel(String jobName, boolean includeOffsets, boolean includeCoordinate) {
        this(jobName, includeOffsets, includeCoordinate, true);
    }

    /**
     * Instantiates a new GIS data model.
     *
     * @param jobName the job name
     * @param includeOffsets the include offsets
     * @param includeCoordinate the include coordinate
     * @param buildSchema the build schema
     */
    public GISDataModel(String jobName, boolean includeOffsets, boolean includeCoordinate, boolean buildSchema) {
        super();
        this.includeOffsets = includeOffsets;
        this.includeCoordinate = includeCoordinate;
        if (buildSchema) {
            defaultFields();
            try {
                this.schema = buildSchema(jobName);
            } catch (ConfigException e) {
                // could not successfully construct the schema... fail hard.
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Adds the place data.
     *
     * @param row row of data
     * @param g geocoding
     */
    protected void addPlaceData(Feature row, Geocoding g) {
        addColumn(row, OpenSextantSchema.ISO_COUNTRY, g.getCountryCode());
        addColumn(row, OpenSextantSchema.PROVINCE, g.getAdmin1());
        addColumn(row, OpenSextantSchema.FEATURE_CLASS, g.getFeatureClass());
        addColumn(row, OpenSextantSchema.FEATURE_CODE, g.getFeatureCode());
        addColumn(row, OpenSextantSchema.PLACE_NAME, g.getPlaceName());

        if (includeCoordinate) {
            if (g.hasCoordinate()) {
                // Set the geometry to be a point, and add the feature to the list
                row.setGeometry(new Point(g.getLatitude(), g.getLongitude()));
                addLatLon(row, g);
            }
        }
    }

    /**
     * Adds the precision.
     *
     * @param row row of data
     * @param g geocoding
     */
    protected void addPrecision(Feature row, Geocoding g) {
        addColumn(row, OpenSextantSchema.PRECISION, g.getPrecision());
    }

    /**
     * Adds the confidence.
     *
     * @param row row of data
     * @param conf confidence
     */
    protected void addConfidence(Feature row, int conf) {
        addColumn(row, OpenSextantSchema.CONFIDENCE, conf);
    }

    /**
     * Adds the offsets.
     *
     * @param row data
     * @param m match metadata
     */
    protected void addOffsets(Feature row, TextMatch m) {
        addColumn(row, OpenSextantSchema.START_OFFSET, m.start);
        addColumn(row, OpenSextantSchema.END_OFFSET, m.end);
    }

    /**
     * Adds the lat lon. to the given data row.
     *
     * @param row data
     * @param g geocoding
     */
    protected void addLatLon(Feature row, Geocoding g) {
        addColumn(row, OpenSextantSchema.LAT, g.getLatitude());
        addColumn(row, OpenSextantSchema.LON, g.getLongitude());
    }

    /**
     * If the caller has additional data to attach to records, allow them to add
     * fields to schema at runtime and map their data to keys on GeocodingResult
     * 
     * Similarly, you could have Geocoding row-level attributes unique to the
     * geocoding whereas attrs on GeocodingResult are global for all geocodings
     * in that result set.
     *
     * @param row the row
     * @param rowAttributes the row attributes
     * @throws ConfigException the config exception
     */
    protected void addAdditionalAttributes(Feature row, Map rowAttributes) throws ConfigException {
        if (rowAttributes != null) {

            try {
                for (String field : rowAttributes.keySet()) {
                    if (log.isDebugEnabled()) {
                        log.debug("FIELD=" + field + " = " + rowAttributes.get(field));
                    }
                    addColumn(row, OpenSextantSchema.getField(field), rowAttributes.get(field));
                }
            } catch (ConfigException fieldErr) {
                throw fieldErr;
            }
        }
    }

    /**
     * Adds the file paths.
     *
     * @param row data
     * @param recordFile original file
     * @param recordTextFile text version of original
     */
    protected void addFilePaths(Feature row, String recordFile, String recordTextFile) {
        // TOOD: HPATH goes here.
        if (recordFile != null) {
            String fname = FilenameUtils.getBaseName(recordFile);
            addColumn(row, OpenSextantSchema.FILENAME, fname);
            if (this.useFileHyperlink) {
                // Caller is responsible for making sure recordFile is absolute path.
                addColumn(row, OpenSextantSchema.FILEPATH,
                        String.format("%s", recordFile, fname));
            } else {
                addColumn(row, OpenSextantSchema.FILEPATH, recordFile);
            }
            // Only add text path:
            // if original is not plaintext or
            // if original has not been converted
            //
            if (recordTextFile != null && !recordFile.equals(recordTextFile)) {
                addColumn(row, OpenSextantSchema.TEXTPATH, recordTextFile);
            }
        } else {
            log.error("No File path given");
        }
    }

    /**
     * Adds the context.
     *
     * @param row the row
     * @param g the g
     */
    protected void addContext(Feature row, TextMatch g) {
        addColumn(row, OpenSextantSchema.CONTEXT, g.getContext());
    }

    /**
     * Adds the match text.
     *
     * @param row the row
     * @param g the g
     */
    protected void addMatchText(Feature row, TextMatch g) {
        addColumn(row, OpenSextantSchema.MATCH_TEXT, g.getText());
    }

    /**
     * Allows caller to add a method or pattern id of sorts to denote how match
     * was derived.
     *
     * @param row the row
     * @param method the method
     */
    protected void addMatchMethod(Feature row, String method) {
        addColumn(row, OpenSextantSchema.MATCH_METHOD, method);
    }

    /**
     * Adds the match method.
     *
     * @param row the row
     * @param match the match
     */
    protected void addMatchMethod(Feature row, TextMatch match) {
        String method = match.getType();
        addColumn(row, OpenSextantSchema.MATCH_METHOD, method);
    }

    /**
     * Builds a GISCore feature array (rows) from a given array of TextMatches;
     * Enrich the features with record-level attributes (columns). If provided
     * result has .input set, then conext and other metadata for this match will
     * be pulled from it. Context is not pulled at match time, as it is not used
     * by most processing -- it tends to be more of an output/formatting issue.
     * And only matches that pass any filters are enriched with context and
     * other metadaa.
     *
     *
     * @param id the id
     * @param g the g
     * @param m the m
     * @param rowAttributes the row attributes
     * @param res the res
     * @return the list
     * @throws ConfigException schema configuration error
     */
    public List buildRows(int id, Geocoding g, TextMatch m, Map rowAttributes,
            ExtractionResult res) throws ConfigException {

        Feature row = new Feature();
        // Administrative settings:
        row.setName(g.getPlaceName());
        row.setSchema(schema.getId());
        row.putData(OpenSextantSchema.SCHEMA_OID, id);

        //
        if (includeOffsets) {
            addOffsets(row, m);
        }

        addPlaceData(row, g);
        addPrecision(row, g);
        addConfidence(row, g.getConfidence());

        if (m.getContext() == null && res.input != null) {
            int len = res.input.buffer.length();
            ResultsUtility.setContextFor(res.input.buffer, m, len);
        }
        addContext(row, m);

        addMatchText(row, m);
        addMatchMethod(row, g.getMethod());

        addAdditionalAttributes(row, rowAttributes);

        if (res.recordFile != null) {
            addFilePaths(row, res.recordFile, res.recordTextFile);
        }

        // this is a list for M x N times
        List features = new ArrayList();
        features.add(row);

        return features;

    }

    private static final DecimalFormat confFmt = new DecimalFormat("0.000");

    /**
     * Convenience method for managing how confidence number is reported in
     * output.
     *
     * @param conf the conf
     * @return the string
     */
    protected String formatConfidence(double conf) {
        return confFmt.format(conf);
    }

    /**
     * Gets the schema.
     *
     * @return the schema
     */
    public Schema getSchema() {
        return this.schema;
    }

    /**
     * Create a schema instance with the fields properly typed and ordered.
     *
     * @param jobName the job name
     * @return the schema
     * @throws ConfigException schema configuration error
     */
    protected Schema buildSchema(String jobName) throws ConfigException {

        if (this.schema != null) {
            return this.schema;
        }

        URI uri = null;
        try {
            uri = new URI("urn:OpenSextant");
        } catch (URISyntaxException e) {
            // e.printStackTrace();
        }

        this.schema = new Schema(uri);
        // Add ID field to the schema
        this.schema.put(OpenSextantSchema.SCHEMA_OID);
        this.schema.setName(jobName);

        for (String field : field_order) {

            if (!this.includeOffsets && (field.equals("start") || field.equals("end"))) {
                continue;
            }

            if (!this.includeCoordinate && (field.equals("lat") || field.equals("lon"))) {
                continue;
            }

            SimpleField F = getField(field);
            this.schema.put(F);
        }

        this.field_set.addAll(field_order);

        return this.schema;
    }

    /**
     * Gets the field.
     *
     * @param field the field
     * @return the field
     * @throws ConfigException the config exception
     */
    protected SimpleField getField(String field) throws ConfigException {
        return OpenSextantSchema.getField(field);
    }

    /**
     * Can add.
     *
     * @param f the f
     * @return true, if successful
     */
    protected boolean canAdd(SimpleField f) {
        if (f == null) {
            return false;
        }
        return field_set.contains(f.getName()) && (schema.get(f.getName()) != null);
    }

    /**
     * Add a column of data to output; Field is validated ; value is not added
     * if null.
     *
     * @param row the row
     * @param f the f
     * @param d the d
     */
    protected void addColumn(Feature row, SimpleField f, Object d) {
        if (d == null) {
            return;
        }
        if (canAdd(f)) {
            row.putData(f, d);
        }
    }

    /**
     * Add a column of data to output; Field is validated.
     *
     * @param row the row
     * @param f the f
     * @param d the d
     */
    protected void addColumn(Feature row, SimpleField f, int d) {
        if (canAdd(f)) {
            row.putData(f, d);
        }
    }

    /**
     * Add a column of data to output; Field is validated.
     *
     * @param row the row
     * @param f the field name
     * @param d value
     */
    protected void addColumn(Feature row, SimpleField f, double d) {
        if (canAdd(f)) {
            row.putData(f, d);
        }
    }

    /**
     * Add a field key to the field order; Caller must also be responsible for
     * ensuring field is valid and exists in Schema.
     *
     * @param fld field name
     * @throws ConfigException the config exception
     */
    public void addField(String fld) throws ConfigException {
        if (getField(fld) == null) {
            throw new ConfigException("Field is not defined in Schema");
        }
        field_order.add(fld);
    }

    /**
     * Removes the field.
     *
     * @param fld field name
     * @throws ConfigException the config exception
     */
    public void removeField(String fld) throws ConfigException {
        if (getField(fld) == null) {
            throw new ConfigException("Field is not defined in Schema; Cannot remove non-existing field");
        }
        field_order.remove(fld);
    }

    /**
     * Default fields.
     */
    protected final void defaultFields() {
        // ID occurs in all output.
        // id.

        // Matching data
        field_order.add("placename");

        // Geographic
        field_order.add("province");
        field_order.add("iso_cc");
        field_order.add("lat");
        field_order.add("lon");

        // Textual context.
        field_order.add("matchtext");
        field_order.add("context");
        field_order.add("filename");
        field_order.add("filepath");
        field_order.add("textpath");

        // File mechanics
        field_order.add("method");
        field_order.add("feat_class");
        field_order.add("feat_code");
        field_order.add("confidence");
        field_order.add("precision");
        field_order.add("start");
        field_order.add("end");
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy