All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.boilerpipe.utils.ScentUtils Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.boilerpipe.utils;

import com.google.common.collect.ListMultimap;
import org.apache.commons.lang3.StringUtils;

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-driven helpers that extract named field values from free text.
 *
 * <p>Created by vincent on 16-10-27.
 * Copyright @ 2013-2016 Platon AI. All rights reserved
 */
public class ScentUtils {

  /**
   * Tells whether a field is one that should contain a person's name.
   *
   * @param fieldName the field name to test; may be {@code null}
   * @return {@code true} only when the field name is {@code "author"} or {@code "director"}
   */
  public static boolean checkFieldIsAPersonName(String fieldName) {
    // Constant-first comparison: a null field name yields false instead of a NullPointerException.
    return "author".equals(fieldName) || "director".equals(fieldName);
  }

  /**
   * Extracts field values from {@code text} using capture group 1 as the matched key
   * and capture group 2 as the matched value.
   *
   * @param text the text to search
   * @param regexFieldRules field name to regular expression rules; a field may have several rules
   * @return the extracted values keyed by field name
   * @see #extract(String, ListMultimap, int, int)
   */
  public static Map<String, String> extract(String text, ListMultimap<String, String> regexFieldRules) {
    return extract(text, regexFieldRules, 1, 2);
  }

  /**
   * Applies every rule in {@code regexFieldRules} to {@code text} and collects the values that
   * survive filtering.
   *
   * <p>A rule contributes a value only when all of the following hold:
   * <ul>
   *   <li>both the key group and the value group matched non-empty (trimmed) text;</li>
   *   <li>the value is still non-empty after bounder filtering;</li>
   *   <li>for person-name fields, the value contains none of
   *       {@code BoiConstants.BAD_PHRASE_IN_NAME};</li>
   *   <li>the value is not longer than the limit registered for the field in
   *       {@code BoiConstants.MAX_FIELD_LENGTH_MAP}, if there is one.</li>
   * </ul>
   *
   * <p>The result is keyed by the rule's field name (not by the text matched by the key group).
   * When several rules of the same field produce a value, the one produced last replaces the
   * earlier ones.
   *
   * @param text the text to search
   * @param regexFieldRules field name to regular expression rules
   * @param keyGroup index of the capture group that must match the field label
   * @param valueGroup index of the capture group that holds the field value
   * @return the extracted values in insertion order; never {@code null}
   * @throws java.util.regex.PatternSyntaxException if a rule is not a valid regular expression
   */
  public static Map<String, String> extract(
      String text, ListMultimap<String, String> regexFieldRules, int keyGroup, int valueGroup) {
    Map<String, String> results = new LinkedHashMap<>();

    for (Map.Entry<String, String> rule : regexFieldRules.entries()) {
      String[] parts = extractToArray(text, Pattern.compile(rule.getValue()), keyGroup, valueGroup);
      if (parts[0].isEmpty() || parts[1].isEmpty()) {
        continue;
      }

      String key = rule.getKey();
      String value = filterExtractedValue(key, parts[1]);
      if (value == null || value.isEmpty()) {
        continue;
      }

      // The field should hold a person's name, but the value does not look like one.
      if (checkFieldIsAPersonName(key)
          && Arrays.stream(BoiConstants.BAD_PHRASE_IN_NAME).anyMatch(value::contains)) {
        continue;
      }

      // Fields without a registered limit are accepted at any length.
      Integer maxLength = BoiConstants.MAX_FIELD_LENGTH_MAP.get(key);
      if (maxLength == null || value.length() <= maxLength) {
        results.put(key, value);
      }
    }

    return results;
  }

  /**
   * Cuts a bounder off an extracted value.
   *
   * <p>Only the first entry of {@code BoiConstants.REGEX_FIELD_BOUNDERS} that the value ends with
   * is considered; the value is then reduced to the text before the first occurrence of that
   * bounder.
   *
   * @param key the field name; currently unused
   * @param value the raw extracted value
   * @return the filtered value, possibly empty
   */
  private static String filterExtractedValue(String key, String value) {
    for (String bounder : BoiConstants.REGEX_FIELD_BOUNDERS) {
      if (value.endsWith(bounder)) {
        // NOTE(review): substringBefore truncates at the FIRST occurrence of the bounder, so
        // "a,b," with bounder "," becomes "a", not "a,b". Confirm this is intended rather than
        // stripping only the trailing bounder.
        value = StringUtils.substringBefore(value, bounder);
        break;
      }
    }

    return value;
  }

  /**
   * Finds the first match of {@code pattern} in {@code text} and returns capture groups 1 and 2.
   *
   * @param text the text to search; may be {@code null}
   * @param pattern the compiled pattern to apply
   * @return a two-element array as described in {@link #extractToArray(String, Pattern, int, int)}
   */
  public static String[] extractToArray(String text, Pattern pattern) {
    return extractToArray(text, pattern, 1, 2);
  }

  /**
   * Finds the first match of {@code pattern} in {@code text} and returns the two requested
   * capture groups, trimmed.
   *
   * @param text the text to search; {@code null} is treated as "no match"
   * @param pattern the compiled pattern to apply
   * @param keyGroup index of the capture group returned at position 0
   * @param valueGroup index of the capture group returned at position 1
   * @return a two-element array {@code {key, value}}; both elements are empty strings when
   *     there is no match, when either group index is outside {@code 0..groupCount}, or when
   *     either group did not participate in the match
   */
  public static String[] extractToArray(String text, Pattern pattern, int keyGroup, int valueGroup) {
    String[] parts = {"", ""};
    if (text == null) {
      return parts;
    }

    Matcher matcher = pattern.matcher(text);
    if (!matcher.find()) {
      return parts;
    }

    // Reject out-of-range indices on both sides; Matcher.group(int) would throw otherwise.
    int groupCount = matcher.groupCount();
    boolean keyInRange = keyGroup >= 0 && keyGroup <= groupCount;
    boolean valueInRange = valueGroup >= 0 && valueGroup <= groupCount;
    if (!keyInRange || !valueInRange) {
      return parts;
    }

    // A group inside an unmatched alternative is null; report the pair only when both exist.
    String k = matcher.group(keyGroup);
    String v = matcher.group(valueGroup);
    if (k != null && v != null) {
      parts[0] = k.trim();
      parts[1] = v.trim();
    }

    return parts;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy