All Downloads are FREE. Search and download functionalities are using the official Maven repository.

hex.genmodel.mojopipeline.transformers.StringPropertiesUnaryTransform Maven / Gradle / Ivy

package hex.genmodel.mojopipeline.transformers;

import ai.h2o.mojos.runtime.api.backend.ReaderBackend;
import ai.h2o.mojos.runtime.frame.MojoFrame;
import ai.h2o.mojos.runtime.frame.MojoFrameMeta;
import ai.h2o.mojos.runtime.transforms.MojoTransform;
import ai.h2o.mojos.runtime.transforms.MojoTransformBuilderFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

public class StringPropertiesUnaryTransform extends MojoTransform {

    StringPropertiesUnaryFunction _function;

    StringPropertiesUnaryTransform(int[] iindices, int[] oindices, StringPropertiesUnaryFunction function) {
        super(iindices, oindices);
        _function = function;
    }

    @Override
    public void transform(MojoFrame frame) {
        String[] a = (String[]) frame.getColumnData(iindices[0]);
        double[] o = (double[]) frame.getColumnData(oindices[0]);
        for (int i = 0, nrows = frame.getNrows(); i < nrows; i++) {
            o[i] = a[i] == null ? null : _function.call(a[i]);
        }
    }

   interface StringPropertiesUnaryFunction {
        void initialize(Map params);
        double call(String value);
    }

    public static class Factory implements MojoTransformBuilderFactory {
        
        private static final HashMap _supportedFunctions = new HashMap() {{
            put("countmatches", new StringPropertiesUnaryFunction() {
                String[] _pattern =  null;

                @Override
                public void initialize(Map params) {
                    Object patternObj = params.get("pattern");
                    if (patternObj == null) {
                        throw new IllegalArgumentException("The 'pattern' param is not passed to 'countmatches' function!");
                    }
                    if (patternObj instanceof String) {
                        _pattern = ((String)patternObj).split("`````");
                    } else {
                        throw new IllegalArgumentException(
                            String.format(
                                "The type '%s' of 'pattern' param is not supported.",
                                patternObj.getClass().getName()));
                    }
                }
                @Override
                public double call(String value) {
                    int count = 0;
                    for (String word : _pattern) {
                        count += StringUtils.countMatches(value, word);
                    }
                    return count;
                }
            });
            put("num_valid_substrings", new StringPropertiesUnaryFunction() {
                HashSet _words =  null;

                @Override
                public void initialize(Map params) {
                    Object wordsObj = params.get("words");
                    if (wordsObj == null) {
                        throw new IllegalArgumentException("The 'words' param is not passed to 'num_valid_substrings' function!");
                    }
                    String wordsPath = (String) wordsObj;
                    try {
                        _words = new HashSet<>(FileUtils.readLines(new File(wordsPath)));
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
                @Override
                public double call(String value) {
                    int count = 0;
                    int N = value.length();
                    for (int i = 0; i < N - 1; i++)
                        for (int j = i + 2; j < N + 1; j++) {
                            if (_words.contains(value.substring(i, j)))
                                count += 1;
                        }
                    return count;
                }
            });
            put("entropy", new StringPropertiesUnaryFunction() {
                @Override
                public void initialize(Map params) {}
                @Override
                public double call(String value) {
                    HashMap freq = new HashMap<>();
                    for (int i = 0; i < value.length(); i++) {
                        char c = value.charAt(i);
                        Integer count = freq.get(c);
                        if (count == null) freq.put(c, 1);
                        else freq.put(c, count + 1);
                    }
                    double sume = 0;
                    int N = value.length();
                    double n;
                    for (char c : freq.keySet()) {
                        n = freq.get(c);
                        sume += -n / N * Math.log(n / N) / Math.log(2);
                    }
                    return sume;
                }
            });
            put("strlen", new StringPropertiesUnaryFunction() {
                @Override
                public void initialize(Map params) {
                }

                @Override
                public double call(String value) {
                    return value.length();
                }
            });
        }};

        public static final String TRANSFORMER_ID = "hex.genmodel.mojopipeline.transformers.StringPropertiesUnaryTransform";
        
        public static StringPropertiesUnaryFunction getFunction(String functionName) {
            final StringPropertiesUnaryFunction function = _supportedFunctions.get(functionName);
            if (function == null) {
                throw new UnsupportedOperationException(
                    String.format("The function '%s' is not supported unary string transformation.", functionName));
            }
            return function;
        }
        
        public static boolean functionExists(String functionName) {
            return _supportedFunctions.containsKey(functionName);
        }

        @Override
        public String transformerName() {
            return TRANSFORMER_ID;
        }

        @Override
        public MojoTransform createBuilder(MojoFrameMeta meta,
                                           int[] iindcies, int[] oindices,
                                           Map params,
                                           ReaderBackend backend) {
            final String functionName = (String) params.get("function");
            final StringPropertiesUnaryFunction function = Factory.getFunction(functionName);
            function.initialize(params);
            return new StringPropertiesUnaryTransform(iindcies, oindices, function);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy