All downloads are free. Search and download functionality uses the official Maven repository.

uk.ac.shef.dcs.sti.parser.ContentValidator Maven / Gradle / Ivy

The newest version!
package uk.ac.shef.dcs.sti.parser;

/**
 * Collection of static predicates over text content and link strings:
 * numeric-content detection, "empty" string detection and recognition of
 * Wikipedia-internal links.
 *
 * <p>Created 10/10/12 15:50.
 *
 * @author Ziqi Zhang
 */
public abstract class ContentValidator {

    /**
     * Suffixes that mark a token as an ordinal. Tokens are matched with
     * {@link String#endsWith(String)}, so generic endings such as "teenth" and
     * "tieth" cover e.g. "fourteenth" and "twentieth".
     */
    protected static final String[] ORDINALS = {"first", "1st", "second", "2nd", "third", "3rd", "fourth", "4th",
            "fifth", "5th", "sixth", "6th", "seventh", "7th", "eighth", "8th", "ninth", "9th", "tenth", "10th",
            "eleventh", "11th", "twelfth", "12th", "teenth", "13th", "tieth"};


    /**
     * Decides whether a phrase is predominantly numeric.
     *
     * <p>The phrase is lower-cased, trimmed and split on whitespace. A token counts
     * as numeric when it ends with one of {@link #ORDINALS}, or when it contains no
     * letters and more than half of its characters are digits.
     *
     * @param tc the phrase to test; may be {@code null}
     * @return true if more than 50% of the tokens are numeric (including ordinals);
     *         false for {@code null}, empty or blank input
     */
    public static boolean isNumericContent(String tc) {
        if (tc == null) {
            return false;
        }
        // Locale.ROOT keeps ordinal matching independent of the default locale
        // (e.g. a Turkish locale would lower-case "FIRST" to a dotless-i form).
        // trim() prevents a leading empty token from diluting the ratio.
        String[] parts = tc.toLowerCase(java.util.Locale.ROOT).trim().split("\\s+");
        int countNumericParts = 0;
        for (String p : parts) {
            if (endsWithOrdinal(p) || isMostlyDigits(p)) {
                countNumericParts++;
            }
        }
        return countNumericParts > parts.length * 0.5;
    }

    /** Returns true if the (lower-cased) token ends with any entry of {@link #ORDINALS}. */
    private static boolean endsWithOrdinal(String token) {
        for (String ord : ORDINALS) {
            if (token.endsWith(ord)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if the token has no letters and more than half of its characters are digits. */
    private static boolean isMostlyDigits(String token) {
        int countDigits = 0;
        for (int i = 0; i < token.length(); i++) {
            // Inspect the token itself, not the enclosing phrase.
            char c = token.charAt(i);
            if (Character.isLetter(c)) {
                return false;
            }
            if (Character.isDigit(c)) {
                countDigits++;
            }
        }
        return countDigits > token.length() * 0.5;
    }

    /**
     * Tests whether a string carries no ASCII alphanumeric content.
     *
     * @param string the string to test; must not be {@code null}
     * @return true if the string contains no character in {@code [a-zA-Z0-9]}
     */
    public static boolean isEmptyString(String string) {
        // only consider english: everything outside a-z, A-Z, 0-9 is discarded
        String alphanumericOnly = string.replaceAll("[^a-zA-Z0-9]", "");
        return alphanumericOnly.length() == 0;
    }

    /**
     * Tests whether a string is empty in the sense of {@link #isEmptyString(String)},
     * or is wholly wrapped in double curly braces ({@code {{...}}}).
     *
     * @param string the string to test; must not be {@code null}
     * @return true if the string starts with "{{" and ends with "}}", or has no
     *         ASCII alphanumeric content
     */
    public static boolean isEmptyMediaWikiString(String string) {
        if (string.startsWith("{{") && string.endsWith("}}")) {
            return true;
        }
        return isEmptyString(string);
    }

    /**
     * Tests whether a URI is treated as a Wikipedia-internal link.
     *
     * @param uri the link to test; may be {@code null}
     * @return true if the URI starts with "/", or starts with "http" or "www" and
     *         contains "en.wikipedia"; false otherwise, including for {@code null}
     */
    public static boolean isWikiInternalLink(String uri) {
        if (uri == null) {
            return false;
        }
        if (uri.startsWith("/")) { //pointing to a wikipedia article
            return true;
        }
        boolean looksAbsolute = uri.startsWith("http") || uri.startsWith("www");
        return looksAbsolute && uri.contains("en.wikipedia");
    }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy