// Source listing of com.smartlogic.classificationserver.client.AuditFormat
// from the Semaphore-CS-Client artifact (available via Maven / Gradle / Ivy).
// Semaphore-CS-Client: client for the Smartlogic Semaphore Classification Server.
package com.smartlogic.classificationserver.client;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.ibm.icu.text.SimpleDateFormat;
/**
 * The Audit Format that will be used to return the data from the Classification History method.
 *
 * Note that any fields that are not configured (within the Classification Server configuration) to be stored
 * in the logs will not be returned by the Audit Format. In this case null is returned by the String and Date
 * getters, and the matching UNDEFINED_* constant is returned by the numeric getters.
 * @author Smartlogic Semaphore
 *
 */
public class AuditFormat {
    protected final Log logger = LogFactory.getLog(getClass());

    // NOTE(review): "hh" is the 12-hour-clock pattern letter. If the server writes 24-hour
    // timestamps, "HH" may have been intended - confirm before changing this public constant.
    public final static String defaultDateFormat = "eee LLL dd hh:mm:ss yyyy";
    public final static int UNDEFINED_INT = -1;
    public final static float UNDEFINED_FLOAT = -1.0f;
    public final static boolean UNDEFINED_BOOLEAN = false;
    public final static Date UNDEFINED_DATE = new Date(0);

    private enum DataField {
        STARTTIME,           // The time that the request started processing
        FINISHTIME,          // The time that the request finished processing
        SOURCEIP,            // The ip address of the client sending the request
        PROTOCOLUSED,        // The protocol used by the client to send the request
        SCORES,              // A comma separated list of document classification scores in format Class.Name:Score
        ERRORNUM,            // The error number for result (is 0 for success)
        ERRORCOMPONENT,      // The component reporting an error (if any)
        ERRORMESSAGE,        // The error message reported (if any)
        OPERATION,           // The operation type for the request
        FILENAME,            // The name of the file (if specified)
        THRESHOLD,           // The threshold used for the classification
        TITLE,               // The title of the document if specified in the request
        SERVERID,            // The id of the server performing the operation
        SINGLEARTICLEMODE,   // SingleArticle or MultiArticle as set in the request
        LEGACYMODE,          // Legacy or Normal as set in request
        CLUSTERINGTYPE,      // Type of clustering to use
        CLUSTERINGTHRESHOLD, // Clustering threshold
        DOCUMENTSCORELIMIT,  // Limit on number of scores at document level
        DIAGNOSTICSMODE,     // Diagnostics or No Diagnostics as appropriate
        FEEDBACKMODE,        // Feedback or No Feedback as appropriate
        TIMETAKEN,           // Time taken (in milliseconds) for request
        DOCUMENTHASH,        // The MD5 hash of the document text (including META data from request)
        AUDITTAG,            // The optional Audit Tag specified in request
        URL,                 // The url that data retrieved from (if specified)
        META_ORIGINALURI     // The meta supplied during classification (the real file name)
    }

    // Maps the name used in the format string (the enum constant name) to its DataField
    private static final Map<String, DataField> lookup = new HashMap<String, DataField>();
    static {
        for (DataField dataField : EnumSet.allOf(DataField.class)) {
            lookup.put(dataField.name(), dataField);
        }
    }

    // Column index of each recognized data field within a data line
    private final Map<DataField, Integer> fieldPositions = new HashMap<DataField, Integer>();
    // Column index -> format clause. Cannot map from field name, 'cos the same field might be in twice
    private final Map<Integer, String> fieldFormats = new HashMap<Integer, String>();
    // Column index -> literal text expected in that column for lines of this format
    private final Map<Integer, String> fixedFields = new HashMap<Integer, String>();

    /**
     * Create the Audit Format from the string returned within the history request.
     * Entries of the form <code>$(NAME)</code> or <code>$(NAME::"format")</code> are treated as data fields;
     * every other entry is recorded as a fixed field that data lines of this format must contain.
     * Unrecognized data field names are logged as warnings and otherwise ignored.
     * @param formatStringArray String array containing format
     * @throws CSDataFormatException Data format exception
     */
    public AuditFormat(String[] formatStringArray) throws CSDataFormatException {
        for (int fieldIndex = 0; fieldIndex < formatStringArray.length; fieldIndex++) {
            String dataFieldName = formatStringArray[fieldIndex].trim();
            if (dataFieldName.startsWith("$(") && dataFieldName.endsWith(")")) {
                // Strip the surrounding "$(" and ")"
                dataFieldName = dataFieldName.substring(2, dataFieldName.length() - 1);

                // Is there a :: format clause to this column?
                int colonPos = dataFieldName.indexOf("::");
                if (colonPos != -1) {
                    String fieldFormat = dataFieldName.substring(colonPos + 2);
                    // The format clause may be wrapped in double quotes - remove them
                    if (fieldFormat.startsWith("\"")) {
                        fieldFormat = fieldFormat.substring(1);
                    }
                    if (fieldFormat.endsWith("\"")) {
                        fieldFormat = fieldFormat.substring(0, fieldFormat.length() - 1);
                    }
                    fieldFormats.put(fieldIndex, fieldFormat);
                    dataFieldName = dataFieldName.substring(0, colonPos);
                }

                DataField dataField = lookup.get(dataFieldName);
                if (logger.isDebugEnabled()) {
                    logger.debug("Data field name: " + dataFieldName + " Data field: " + dataField + " fieldIndex: " + fieldIndex);
                }
                if (dataField == null) {
                    logger.warn("Unrecognized data field \"" + dataFieldName + "\" returned in CS format string");
                } else {
                    fieldPositions.put(dataField, fieldIndex);
                }
            } else {
                // Keep a record of the fixed fields expected by this format
                fixedFields.put(fieldIndex, dataFieldName);
            }
        }
    }

    /**
     * Is the data line in the format expected for this AuditFormat. This is tested by checking the fixed format fields to see if they match.
     * If they match, then it is assumed that the data is of this format.
     * @param data The data line containing the audit for this record
     * @return True if data matches the AuditFormat
     */
    public boolean isExample(String[] data) {
        for (Map.Entry<Integer, String> fixedField : fixedFields.entrySet()) {
            int fieldIndex = fixedField.getKey();
            if (data == null || data.length < fieldIndex + 1) {
                return false; // This data line is missing or too short to be of this format
            }
            if (data[fieldIndex] == null) {
                return false; // There's nothing in this column, but there should be
            }
            if (!data[fieldIndex].trim().equals(fixedField.getValue())) {
                return false; // There's the wrong data in this column
            }
        }
        return true;
    }

    /**
     * Get the tag attached to the request at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getAuditTag(String[] data) {
        return getStringField(DataField.AUDITTAG, data);
    }

    /**
     * Get the clustering threshold used at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public float getClusteringThreshold(String data[]) {
        return getFloatField(DataField.CLUSTERINGTHRESHOLD, data);
    }

    /**
     * Get the clustering type used at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getClusteringType(String[] data) {
        return getStringField(DataField.CLUSTERINGTYPE, data);
    }

    /**
     * Get the diagnostics mode in use at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getDiagnosticsMode(String[] data) {
        return getStringField(DataField.DIAGNOSTICSMODE, data);
    }

    /**
     * Get the hash of the document as calculated at classification time. This
     * can be used to identify document changes
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getDocumentHash(String[] data) {
        return getStringField(DataField.DOCUMENTHASH, data);
    }

    /**
     * Get the document score limit (i.e. the maximum number of tags
     * that can be returned) in place at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public Float getDocumentScoreLimit(String[] data) {
        return getFloatField(DataField.DOCUMENTSCORELIMIT, data);
    }

    /**
     * For an error record, return the component in error
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getErrorComponent(String[] data) {
        return getStringField(DataField.ERRORCOMPONENT, data);
    }

    /**
     * For an error record, return the message associated with the error
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getErrorMessage(String[] data) {
        return getStringField(DataField.ERRORMESSAGE, data);
    }

    /**
     * For an error record, return the number of that error
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public int getErrorNum(String[] data) {
        return getIntField(DataField.ERRORNUM, data);
    }

    /**
     * Return the feedback mode in place at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getFeedbackMode(String[] data) {
        return getStringField(DataField.FEEDBACKMODE, data);
    }

    /**
     * Get the name of the file actually classified
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getFileName(String[] data) {
        return getStringField(DataField.FILENAME, data);
    }

    /**
     * Get the time at which classification was completed
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public Date getFinishDateTime(String[] data) {
        return getDateField(DataField.FINISHTIME, data);
    }

    /**
     * Get the legacy mode flag in place at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getLegacyMode(String[] data) {
        return getStringField(DataField.LEGACYMODE, data);
    }

    /**
     * Get the name of operation (this is pretty much redundant for classification requests)
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getOperation(String[] data) {
        return getStringField(DataField.OPERATION, data);
    }

    /**
     * Get the protocol used for the submission of the document
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getProtocolUsed(String[] data) {
        return getStringField(DataField.PROTOCOLUSED, data);
    }

    /**
     * Return the identity of the classification server
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getServerId(String[] data) {
        return getStringField(DataField.SERVERID, data);
    }

    /**
     * Get the status of the single article flag at classification time
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getSingleArticleMode(String[] data) {
        return getStringField(DataField.SINGLEARTICLEMODE, data);
    }

    /**
     * Get the IP address of the requestor for the classification
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getSourceIP(String[] data) {
        return getStringField(DataField.SOURCEIP, data);
    }

    /**
     * Get the time of the start of the classification process
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public Date getStartDateTime(String[] data) {
        return getDateField(DataField.STARTTIME, data);
    }

    /**
     * Get the threshold used for the classification request
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public float getThreshold(String data[]) {
        return getFloatField(DataField.THRESHOLD, data);
    }

    /**
     * Get the time taken to classify the document
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public float getTimeTaken(String data[]) {
        return getFloatField(DataField.TIMETAKEN, data);
    }

    /**
     * Get the title of the document as extracted by the classification server
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getTitle(String[] data) {
        return getStringField(DataField.TITLE, data);
    }

    /**
     * Get the URL of the submitted document
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getURL(String[] data) {
        return getStringField(DataField.URL, data);
    }

    /**
     * Get the original uri of the submitted document
     * @param data The data line containing the audit for this record
     * @return The value from the data line
     */
    public String getOriginalURI(String[] data) {
        return getStringField(DataField.META_ORIGINALURI, data);
    }

    /**
     * Return the scores extracted from the data line. One ClassificationScore object is returned
     * for each tag returned by Classification Server. Scores are read from the SCORES column onwards,
     * stopping at the first column that is not a score.
     * @param data The data line containing the audit for this record
     * @return The scores from the data line; empty if the format has no SCORES field or the line holds none
     */
    public Collection<ClassificationScore> getClassificationScores(String[] data) {
        Collection<ClassificationScore> classificationScores = new ArrayList<ClassificationScore>();
        Integer startColumn = fieldPositions.get(DataField.SCORES);
        if (startColumn == null || data == null) {
            return classificationScores;
        }
        for (int col = startColumn; col < data.length; col++) {
            try {
                classificationScores.add(new ClassificationScore(data[col]));
            } catch (NotAScoreException e) {
                // The first column that is not a score marks the end of the score list
                break;
            }
        }
        return classificationScores;
    }

    /**
     * Read a field as a float.
     * @param dataField The field to read
     * @param data The data line containing the audit for this record
     * @return The parsed value, or UNDEFINED_FLOAT if the field is absent or not a valid float
     */
    private float getFloatField(DataField dataField, String[] data) {
        String stringValue = getStringField(dataField, data);
        if (stringValue == null) {
            return UNDEFINED_FLOAT;
        }
        try {
            return Float.parseFloat(stringValue);
        } catch (NumberFormatException e) {
            // Log the failure in the same way as getIntField rather than hiding it
            logger.warn("NumberFormatException in Float field: " + e.getMessage());
            return UNDEFINED_FLOAT;
        }
    }

    /**
     * Read a field as an int.
     * @param dataField The field to read
     * @param data The data line containing the audit for this record
     * @return The parsed value, or UNDEFINED_INT if the field is absent or not a valid integer
     */
    private int getIntField(DataField dataField, String[] data) {
        String stringValue = getStringField(dataField, data);
        if (stringValue == null) {
            return UNDEFINED_INT;
        }
        try {
            return Integer.parseInt(stringValue);
        } catch (NumberFormatException e) {
            logger.warn("NumberFormatException in Integer field: " + e.getMessage());
            return UNDEFINED_INT;
        }
    }

    /**
     * Read a field as a date, using the format clause supplied for its column in the format string
     * or, if there was none, defaultDateFormat.
     * @param dataField The field to read
     * @param data The data line containing the audit for this record
     * @return The parsed date, null if the field is absent, or UNDEFINED_DATE if it cannot be parsed
     */
    private Date getDateField(DataField dataField, String[] data) {
        String stringValue = getStringField(dataField, data);
        if (stringValue == null) {
            return null;
        }

        String dateFormat = fieldFormats.get(fieldPositions.get(dataField));
        if (dateFormat == null) {
            dateFormat = defaultDateFormat;
        }

        SimpleDateFormat simpleDateFormat = new SimpleDateFormat(dateFormat);
        try {
            return simpleDateFormat.parse(stringValue);
        } catch (ParseException e) {
            logger.warn("ParseException in Date field (" + dateFormat + "): " + e.getMessage());
            return UNDEFINED_DATE;
        }
    }

    /**
     * Read the raw text of a field.
     * @param dataField The field to read
     * @param data The data line containing the audit for this record
     * @return The column content, or null if this format does not contain the field
     *         or the data line is missing or too short to hold that column
     */
    private String getStringField(DataField dataField, String[] data) {
        Integer column = fieldPositions.get(dataField);
        if (column == null || data == null || column >= data.length) {
            // Short lines are possible (isExample checks for them), so report "no value" rather than throwing
            return null;
        }
        return data[column];
    }

    @Override
    public String toString() {
        StringBuilder stringBuilder = new StringBuilder("Field Positions\n");
        for (Map.Entry<DataField, Integer> position : fieldPositions.entrySet()) {
            stringBuilder.append(" ").append(position.getKey().name()).append(":").append(position.getValue()).append("\n");
        }
        stringBuilder.append("\nField Formats\n");
        for (Map.Entry<Integer, String> format : fieldFormats.entrySet()) {
            stringBuilder.append(" ").append(format.getKey()).append(":").append(format.getValue()).append("\n");
        }
        stringBuilder.append("\nFixed Fields\n");
        for (Map.Entry<Integer, String> fixedField : fixedFields.entrySet()) {
            stringBuilder.append(" ").append(fixedField.getKey()).append(":").append(fixedField.getValue()).append("\n");
        }
        return stringBuilder.toString();
    }
}