All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hurence.logisland.processor.useragent.ParseUserAgent Maven / Gradle / Ivy

The newest version!
/**
 * Copyright (C) 2016 Hurence ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.hurence.logisland.processor.useragent;

import com.hurence.logisland.annotation.documentation.*;
import com.hurence.logisland.classloading.PluginLoader;
import com.hurence.logisland.component.InitializationException;
import com.hurence.logisland.component.PropertyDescriptor;
import com.hurence.logisland.processor.*;
import com.hurence.logisland.record.Field;
import com.hurence.logisland.record.FieldDictionary;
import com.hurence.logisland.record.FieldType;
import com.hurence.logisland.record.Record;
import com.hurence.logisland.validator.StandardValidators;

import com.hurence.logisland.validator.ValidationResult;
import com.hurence.logisland.validator.Validator;
import nl.basjes.parse.useragent.UserAgent;
import nl.basjes.parse.useragent.UserAgentAnalyzer;
import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.stream.Collectors;

/**
 * HTTP user-agent processor
 */
@Category(ComponentCategory.ENRICHMENT)
@Tags({"User-Agent", "clickstream", "DMP"})
@CapabilityDescription(
        "The user-agent processor allows to decompose User-Agent value from an HTTP header into several attributes of interest."
                + " There is no standard format for User-Agent strings, hence it is not easily possible to use regexp to handle them."
                + " This processor rely on the `YAUAA library `_ to do the heavy work.")
@ExtraDetailFile("./details/ParseUserAgent-Detail.rst")
public class ParseUserAgent extends AbstractProcessor {

    private static final Object sync = new Object();

    private boolean debug;
    private String userAgentField;
    private boolean useCache;
    private boolean userAgentKeep;
    private int cacheSize;
    private List selectedFields;
    private boolean confidenceEnabled;
    private boolean ambiguityEnabled;

    private static final String KEY_DEBUG = "debug";
    private static final String KEY_CACHE_ENABLED = "cache.enabled";
    private static final String KEY_CACHE_SIZE = "cache.size";
    private static final String KEY_USERAGENT_FIELD = "useragent.field";
    private static final String KEY_USERAGENT_KEEP = "useragent.keep";
    private static final String KEY_FIELDS_TO_RETURN = "fields";
    private static final String KEY_CONFIDENCE_ENABLED = "confidence.enabled";
    private static final String KEY_AMBIGUITY_ENABLED = "ambiguity.enabled";

    //private static GenericObjectPool pool;

//    private static final List defaultFields = Arrays.asList(
//            "DeviceClass",
//            "DeviceName",
//            "DeviceBrand",
//            "DeviceFirmwareVersion",
//            "DeviceVersion",
//            "OperatingSystemClass",
//            "OperatingSystemName",
//            "OperatingSystemVersion",
//            "OperatingSystemNameVersion",
//            "LayoutEngineClass",
//            "LayoutEngineName",
//            "LayoutEngineVersion",
//            "LayoutEngineVersionMajor",
//            "LayoutEngineNameVersion",
//            "LayoutEngineNameVersionMajor",
//            "AgentClass",
//            "AgentName",
//            "AgentVersion",
//            "AgentVersionMajor",
//            "AgentNameVersion",
//            "AgentNameVersionMajor"
//            );

    private static final List defaultFields;

    static {
        //is there a better way?
        ClassLoader cl = PluginLoader.getRegistry().get(ParseUserAgent.class.getCanonicalName());
        ClassLoader current = Thread.currentThread().getContextClassLoader();
        //will never be null
        try {
            Thread.currentThread().setContextClassLoader(cl);
            defaultFields = new UserAgentAnalyzer().getAllPossibleFieldNamesSorted();
        } finally {
            Thread.currentThread().setContextClassLoader(current);
        }
    }


    public static final PropertyDescriptor DEBUG = new PropertyDescriptor.Builder()
            .name(KEY_DEBUG)
            .description("Enable debug.")
            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .required(false)
            .defaultValue("false")
            .build();

    // Use cache
    public static final PropertyDescriptor CACHE_ENABLED = new PropertyDescriptor.Builder()
            .name(KEY_CACHE_ENABLED)
            .description("Enable caching. Caching to avoid to redo the same computation for many identical User-Agent strings.")
            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .required(false)
            .defaultValue("true")
            .build();

    // Cache size
    public static final PropertyDescriptor CACHE_SIZE = new PropertyDescriptor.Builder()
            .name(KEY_CACHE_SIZE)
            .description("Set the size of the cache.")
            .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
            .required(false)
            .defaultValue("1000")
            .build();

    public static final PropertyDescriptor USERAGENT_FIELD = new PropertyDescriptor.Builder()
            .name(KEY_USERAGENT_FIELD)
            .description("Must contain the name of the field that contains the User-Agent value in the incoming record.")
            //.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .required(true)
            //.defaultValue("useragent") // TODO : define what should be the default value
            .build();

    // Removes original User-Agent
    public static final PropertyDescriptor USERAGENT_KEEP = new PropertyDescriptor.Builder()
            .name(KEY_USERAGENT_KEEP)
            .description("Defines if the field that contained the User-Agent must be kept or not in the resulting records.")
            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .required(false)
            .defaultValue("true")
            .build();

    // Report per field confidence
    public static final PropertyDescriptor CONFIDENCE_ENABLED = new PropertyDescriptor.Builder()
            .name(KEY_CONFIDENCE_ENABLED)
            .description("Enable confidence reporting. Each field will report a confidence attribute with a value comprised between 0 and 10000.")
            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .required(false)
            .defaultValue("false")
            .build();

    // Report ambiguity count
    public static final PropertyDescriptor AMBIGUITY_ENABLED = new PropertyDescriptor.Builder()
            .name(KEY_AMBIGUITY_ENABLED)
            .description("Enable ambiguity reporting. Reports a count of ambiguities.")
            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
            .required(false)
            .defaultValue("false")
            .build();

    // List of attributes to return
    public static final PropertyDescriptor FIELDS_TO_RETURN = new PropertyDescriptor.Builder()
            .name(KEY_FIELDS_TO_RETURN)
            .description("Defines the fields to be returned.")
            .addValidator(new Validator() {
                @Override
                public ValidationResult validate(final String subject, final String value) {

                    String reason = null;
                    try {
                        String[] fields = value.split(",");
                        for (String field : fields) {
                            String f = field.trim();
                            if (!defaultFields.contains(f)) {
                                reason += "The field " + f + " is not valid. ";
                            }
                        }
                    } catch (final Exception e) {
                        reason = "not a comma separated list";
                    }

                    return new ValidationResult.Builder().subject(subject).input(value).explanation(reason).valid(reason == null).build();
                }
            })
            .required(false)
            .defaultValue(String.join(", ", defaultFields))
            .build();

    // TODO :  add the following params
    // Resource file with regex ???

    // error if useragent field is missing true/false

    @Override
    public List getSupportedPropertyDescriptors() {

        final List descriptors = new ArrayList<>();
        descriptors.add(DEBUG);
        descriptors.add(CACHE_ENABLED);
        descriptors.add(CACHE_SIZE);
        descriptors.add(USERAGENT_FIELD);
        descriptors.add(USERAGENT_KEEP);
        descriptors.add(CONFIDENCE_ENABLED);
        descriptors.add(AMBIGUITY_ENABLED);
        descriptors.add(FIELDS_TO_RETURN);
        return Collections.unmodifiableList(descriptors);
    }


    @Override
    public void init(final ProcessContext context) throws InitializationException {
        super.init(context);
        getLogger().debug("Initializing User-Agent Processor");

        debug = context.getPropertyValue(DEBUG).asBoolean();
        userAgentField = context.getPropertyValue(USERAGENT_FIELD).asString();
        userAgentKeep = context.getPropertyValue(USERAGENT_KEEP).asBoolean();
        useCache = context.getPropertyValue(CACHE_ENABLED).asBoolean();
        cacheSize = context.getPropertyValue(CACHE_SIZE).asInteger();
        String tmp = context.getPropertyValue(FIELDS_TO_RETURN).asString();
        selectedFields = Arrays.asList(tmp.split(",")).stream().map(String::trim).collect(Collectors.toList());
        confidenceEnabled = context.getPropertyValue(CONFIDENCE_ENABLED).asBoolean();
        ambiguityEnabled = context.getPropertyValue(AMBIGUITY_ENABLED).asBoolean();

        if (debug) {
            getLogger().info(KEY_USERAGENT_FIELD + "\t: " + userAgentField);
            getLogger().info(KEY_USERAGENT_KEEP + "\t: " + userAgentKeep);
            getLogger().info(KEY_DEBUG + "\t: " + debug);
            getLogger().info(KEY_CACHE_ENABLED + "\t: " + useCache);
            getLogger().info(KEY_CACHE_SIZE + "\t: " + cacheSize);
            getLogger().info(KEY_FIELDS_TO_RETURN + "\t: " + selectedFields);
            getLogger().info(KEY_CONFIDENCE_ENABLED + "\t: " + confidenceEnabled);
            getLogger().info(KEY_AMBIGUITY_ENABLED + "\t: " + ambiguityEnabled);
        }

        if (Singleton.get() == null) {
            getLogger().debug("Initializing UserAgentAnalyzerPool");
            synchronized (sync) {
                if (Singleton.get() == null) {

                    GenericObjectPoolConfig config = new GenericObjectPoolConfig();

                    //config.setMaxIdle(1);
                    config.setMaxTotal(10);

                    //TestOnBorrow=true --> To ensure that we get a valid object from pool
                    //config.setTestOnBorrow(true);

                    //TestOnReturn=true --> To ensure that valid object is returned to pool
                    //config.setTestOnReturn(true);

                    PooledUserAgentAnalyzerFactory factory = new PooledUserAgentAnalyzerFactory(selectedFields, cacheSize);

                    UserAgentAnalyzerPool pool = new UserAgentAnalyzerPool(factory, config);
                    Singleton.set(pool);
                }
            }
        }
    }

    @Override
    public Collection process(ProcessContext context, Collection records) {
        if (debug) {
            getLogger().debug("User-Agent Processor records input: " + records);
        }
        UserAgentAnalyzerPool pool = null;
        UserAgentAnalyzer uaa = null;
        try {
            pool = (UserAgentAnalyzerPool) Singleton.get();
            getLogger().debug("borrow UserAgentAnalyzer from pool");
            uaa = pool.borrowObject();
            processRecords(records, uaa);
        } catch (Throwable t) {
            getLogger().error("Error retrieving user-agent-analyser from pool", t);
            for (Record record : records) {
                record.setStringField(FieldDictionary.RECORD_ERRORS, "Failure retrieving user-agent-analyser from pool");
            }
        } finally {
            if (pool != null && uaa != null) {
                pool.returnObject(uaa);
            }
        }

        if (debug) {
            getLogger().debug("User-Agent Processor records output: " + records);
        }
        return records;
    }

    private void processRecords(Collection records, UserAgentAnalyzer uaa) {
        for (Record record : records) {

            Field uaField = record.getField(userAgentField);
            if (uaField == null) {
                getLogger().debug("Skipping record. Field '" + userAgentField + "' does not exists in record");
                continue;
            }

            String recordValue = (String) uaField.getRawValue();


            UserAgent agent;
            try {
                agent = uaa.parse(recordValue);
            } catch (Throwable t) {
                record.setStringField(FieldDictionary.RECORD_ERRORS, "Failure in User-agent decoding");
                getLogger().error("Cannot parse User-Agent content: " + record, t);
                continue;
            }

            for (String field : selectedFields) {
                String value = agent.getValue(field);
                if (value != null && !value.isEmpty()) {
                    record.setStringField(field, value);
                }
                if (confidenceEnabled) {
                    record.setField(new Field(field + ".confidence", FieldType.LONG, agent.getConfidence(field)));
                }
            }

            if (ambiguityEnabled) {
                record.setField(new Field("ambiguity", FieldType.INT, agent.getAmbiguityCount()));
            }


            if (!userAgentKeep) {
                record.removeField(userAgentField);
            }
        }
    }

    // TODO :
//    @Override
//    public void onPropertyModified(PropertyDescriptor descriptor, String oldValue, String newValue) {
//
//        logger.debug("property {} value changed from {} to {}", descriptor.getName(), oldValue, newValue);
//
//        /**
//         * Handle the debug property
//         */
//        if (descriptor.getName().equals(KEY_DEBUG))
//        {
//          if (newValue != null)
//          {
//              if (newValue.equalsIgnoreCase("true"))
//              {
//                  debug = true;
//              }
//          } else
//          {
//              debug = false;
//          }
//        }
//    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy