// de.undercouch.citeproc.helper.StringHelper Maven / Gradle / Ivy
package de.undercouch.citeproc.helper;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Helper methods related to Strings
* @author Michel Kraemer
*/
public class StringHelper {
/**
 * Lookup table that maps a value between 0 and 15 to the matching
 * uppercase hexadecimal digit
 */
private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();
/**
 * Regular expression fragment that has to follow a stop word: one or more
 * characters that are neither letters, digits, underscores, colons, nor
 * quotation marks
 */
private static final String TITLECASE_STOPWORD_FOLLOWEDBY = "[^\\p{L}\\d_:'\"‘’“”]+";

/**
 * Words that should not be converted to title case, compiled to
 * case-insensitive patterns and ordered from the longest to the shortest
 * expression. See stop-words.json
 */
private static final Pattern[] TITLECASE_STOPWORD_PATTERNS;

static {
    String p = TITLECASE_STOPWORD_FOLLOWEDBY;
    String[] expressions = {
        "^a" + p, "^according\\s+to" + p, "^across" + p, "^afore" + p,
        "^after" + p, "^against" + p, "^ahead\\s+of" + p, "^along" + p,
        "^alongside" + p, "^amid" + p, "^amidst" + p, "^among" + p,
        "^amongst" + p, "^an" + p, "^and" + p, "^anenst" + p,
        "^apart\\s+from" + p, "^apropos" + p, "^apud" + p,
        "^around" + p, "^as" + p, "^as\\s+regards" + p, "^aside" + p,
        "^astride" + p, "^at" + p, "^athwart" + p, "^atop" + p,
        "^back\\s+to" + p, "^barring" + p, "^because\\s+of" + p,
        "^before" + p, "^behind" + p, "^below" + p, "^beneath" + p,
        "^beside" + p, "^besides" + p, "^between" + p, "^beyond" + p,
        "^but" + p, "^by" + p, "^c" + p, "^ca" + p, "^circa" + p,
        "^close\\s+to" + p, "^d['’](?=\\p{L})", "^de" + p, "^despite" + p,
        "^down" + p, "^due\\s+to" + p, "^during" + p, "^et" + p,
        "^except" + p, "^far\\s+from" + p, "^for" + p, "^forenenst" + p,
        "^from" + p, "^given" + p, "^in" + p, "^inside" + p,
        "^instead\\s+of" + p, "^into" + p, "^lest" + p, "^like" + p,
        "^modulo" + p, "^near" + p, "^next" + p, "^nor" + p,
        "^notwithstanding" + p, "^of" + p, "^off" + p, "^on" + p,
        "^onto" + p, "^or" + p, "^out" + p, "^outside\\s+of" + p,
        "^over" + p, "^per" + p, "^plus" + p, "^prior\\s+to" + p,
        "^pro" + p, "^pursuant\\s+to" + p, "^qua" + p,
        "^rather\\s+than" + p, "^regardless\\s+of" + p, "^sans" + p,
        "^since" + p, "^so" + p, "^such\\s+as" + p, "^than" + p,
        "^that\\s+of" + p, "^the" + p, "^through" + p,
        "^throughout" + p, "^thru" + p, "^thruout" + p, "^till" + p,
        "^to" + p, "^toward" + p, "^towards" + p, "^under" + p,
        "^underneath" + p, "^until" + p, "^unto" + p, "^up" + p,
        "^upon" + p, "^v\\." + p, "^van" + p, "^versus" + p, "^via" + p,
        "^vis-à-vis" + p, "^von" + p, "^vs\\." + p, "^where\\s+as" + p,
        "^with" + p, "^within" + p, "^without" + p, "^yet" + p
    };

    // Longer expressions come first so that they are tried before any
    // shorter expression; the sort is stable, so expressions of equal
    // length keep the order in which they are listed above. Every
    // expression is then compiled to a case-insensitive pattern.
    TITLECASE_STOPWORD_PATTERNS = Arrays.stream(expressions)
        .sorted(Comparator.comparingInt(String::length).reversed())
        .map(expression -> Pattern.compile(expression, Pattern.CASE_INSENSITIVE))
        .toArray(Pattern[]::new);
}
/**
* Matches a word at the beginning of the input: a letter or digit followed
* by any number of letters, digits, square brackets, parentheses,
* apostrophes (straight or typographic), or ampersands
*/
private static final Pattern WORD_PATTERN =
Pattern.compile("^[\\p{L}\\d][\\p{L}\\d\\[\\]()'\u2019&]*");
/**
* Matches a mail address at the beginning of the input, optionally
* prefixed with {@code mailto:}. Matching is case-insensitive.
*
* Based on Markdown by John Gruber
* (https://daringfireball.net/projects/markdown/).
* Released under a BSD-style license.
*/
private static final Pattern MAIL_PATTERN =
Pattern.compile("^(mailto:)?([\\w.-]+@[a-z0-9-]+(\\.[a-z0-9-]+)*\\.[a-z]+)", Pattern.CASE_INSENSITIVE);
/**
* Regular expression group listing the top-level domains that are
* accepted by the URL pattern for addresses without a scheme.
*
* Based on John Gruber's URL regex
* (https://gist.github.com/gruber/8891611 or
* https://daringfireball.net/2010/07/improved_regex_for_matching_urls)
* released under public domain. Slightly simplified.
*
* NOTE(review): the alternative {@code Ja} (between {@code sj} and
* {@code sk}) is the only entry that is not all lowercase; it looks like
* it was carried over from the upstream expression - confirm whether it
* is intended.
*/
private static final String TLDs = "(com|net|org|edu|gov|mil|aero|asia|biz|cat|" +
"coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|" +
"ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|" +
"bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|" +
"cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|" +
"es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|" +
"gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|" +
"je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|" +
"lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|" +
"mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|" +
"pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|" +
"si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|" +
"tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|" +
"vn|vu|wf|ws|ye|yt|yu|za|zm|zw)";
private static final Pattern URL_PATTERN =
Pattern.compile("^((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\\-]+[.]" + TLDs + "/)" +
"(?:[^\\s()<>{}\\[\\]]+|\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|" +
"\\(\\S+?\\))+(?:\\([^\\s()]*?\\([^\\s()]+\\)[^\\s()]*?\\)|" +
"\\(\\S+?\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’])|" +
"(?= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9')) {
sb.append(c);
} else {
sb.append('_');
}
break;
}
}
return sb.toString();
}
/**
 * Escapes characters in the given string according to Java rules.
 * Backspace, newline, tab, form feed, carriage return, backslash, and
 * double quote are replaced by their two-character escape sequences.
 * Every other character below 32 or above 0x7f is replaced by a
 * backslash, the letter {@code u}, and its four-digit uppercase
 * hexadecimal code. All remaining characters are copied unchanged.
 * @param s the string to escape (may be {@code null})
 * @return the escaped string or {@code null} if {@code s} is {@code null}
 */
public static String escapeJava(String s) {
    if (s == null) {
        return null;
    }

    // Reserve room for the input plus 50% for escape sequences, but at
    // least two characters. Math.max keeps the capacity valid even if the
    // estimate overflows to a negative number for extremely long input.
    int estimate = s.length() + s.length() / 2;
    StringBuilder sb = new StringBuilder(Math.max(2, estimate));

    for (int i = 0; i < s.length(); ++i) {
        char c = s.charAt(i);
        switch (c) {
            case '\b':
                sb.append("\\b");
                break;
            case '\n':
                sb.append("\\n");
                break;
            case '\t':
                sb.append("\\t");
                break;
            case '\f':
                sb.append("\\f");
                break;
            case '\r':
                sb.append("\\r");
                break;
            case '\\':
                sb.append("\\\\");
                break;
            case '"':
                sb.append("\\\"");
                break;
            default:
                if (c < 32 || c > 0x7f) {
                    // control or non-ASCII character
                    sb.append("\\u");
                    sb.append(hex4(c));
                } else {
                    sb.append(c);
                }
                break;
        }
    }
    return sb.toString();
}
/**
 * Formats the numeric value of a character as exactly four uppercase
 * hexadecimal digits (padded with leading zeros)
 * @param c the character to convert
 * @return the four-digit hexadecimal string
 */
private static String hex4(char c) {
    char[] digits = new char[4];
    // fill from the least significant digit (rightmost position) to the
    // most significant one, consuming four bits per step
    for (int pos = digits.length - 1, rest = c; pos >= 0; --pos, rest >>>= 4) {
        digits[pos] = HEX_DIGITS[rest & 0xF];
    }
    return new String(digits);
}
/**
 * Determines the length of the longest suffix of {@code a} that is also a
 * prefix of {@code b}. If either argument is {@code null} or empty, the
 * result is 0.
 *
 * <p>Examples:</p>
 * <pre>
 * overlap("abcd", "cdef") = 2
 * overlap("abcd", "xyz") = 0
 * overlap("a", "a") = 1
 * overlap("ab", "b") = 1
 * overlap("abcd", "bcdefg") = 3
 * overlap("", "a") = 0
 * overlap("a", "") = 0
 * </pre>
 *
 * @param a the first string
 * @param b the second string
 * @return the number of overlapping characters
 */
public static int overlap(CharSequence a, CharSequence b) {
    if (a == null || b == null) {
        return 0;
    }

    int aLength = a.length();
    int bLength = b.length();

    // try the longest possible overlap first and shrink it step by step
    for (int candidate = Math.min(aLength, bLength); candidate > 0; --candidate) {
        int offset = aLength - candidate;
        int matched = 0;
        while (matched < candidate &&
                a.charAt(offset + matched) == b.charAt(matched)) {
            ++matched;
        }
        if (matched == candidate) {
            return candidate;
        }
    }

    return 0;
}
/**
 * Check if the given string consists only of the uppercase ASCII letters
 * {@code A} to {@code Z}. Any other character (digits, whitespace,
 * punctuation, non-ASCII letters) makes the check fail.
 * @param s the string
 * @return {@code true} if every character is between {@code A} and
 * {@code Z} (also {@code true} for the empty string)
 */
private static boolean isAllUppercase(String s) {
    return s.chars().allMatch(c -> c >= 'A' && c <= 'Z');
}
/**
 * Check if the given string contains no letter that is not uppercase.
 * Characters that are not letters (digits, whitespace, punctuation) are
 * ignored.
 * @param s the string
 * @return {@code true} if the letters in the given string are all
 * uppercase (also {@code true} if the string contains no letters at all)
 */
private static boolean titleAllUppercase(String s) {
    return s.chars().noneMatch(c ->
        Character.isLetter((char) c) && !Character.isUpperCase((char) c));
}
/**
 * Check if the first character of a word should be capitalized. This is
 * not the case for a single character from the Unicode range 0x0370 to
 * 0x03FF (Greek) and for words that contain an uppercase character
 * anywhere after the first position.
 * @param s the string
 * @return {@code true} if the string should be capitalized
 */
private static boolean shouldCapitalize(String s) {
    // single greek characters are used as symbols in scientific papers
    // and must keep their case
    boolean singleGreekSymbol = s.length() == 1 &&
        s.charAt(0) >= 0x0370 && s.charAt(0) <= 0x03FF;
    if (singleGreekSymbol) {
        return false;
    }

    // leave words with inner uppercase characters untouched
    return s.chars()
        .skip(1)
        .noneMatch(c -> Character.isUpperCase((char) c));
}
/**
 * Check if a stop word found in a title should be converted to lowercase
 * @param w the matched stop word
 * @param str the text that starts with the matched stop word
 * @return {@code true} if the stop word should be converted to lowercase
 */
private static boolean shouldStopwordLowercase(String w, String str) {
    boolean elidedD = w.equalsIgnoreCase("d'") || w.equalsIgnoreCase("d’");
    if (!elidedD) {
        // stop words that are all uppercase keep their case
        return !titleAllUppercase(w);
    }

    // exception: the decision for "d'" depends on the word that
    // immediately follows the two characters of the stop word
    String rest = str.substring(2);
    Matcher next = WORD_PATTERN.matcher(rest);
    if (!next.find()) {
        return true;
    }

    // keep "d'" as it is if the following word is completely uppercase
    String following = rest.substring(0, next.end());
    return !titleAllUppercase(following);
}
/**
* Converts the words in a given string to title case (according to the
* CSL specification).
*
* A title whose letters are all uppercase is converted to lowercase first.
* The string is then processed from left to right. At every position the
* following candidates are tried in this order: a stop word (only if at
* least one word has already been found in the current sentence), a
* possessive 's directly after a letter or digit, a mail address, a URL,
* and a normal word. Stop words and the possessive 's are written in
* lowercase (with the exceptions handled by shouldStopwordLowercase), mail
* addresses and URLs are copied unchanged, and the first character of a
* normal word is converted to title case if shouldCapitalize allows it.
* If nothing matches, a single character is copied together with any
* whitespace that follows it.
*
* @param str the string to convert
* @return the converted string, {@code null} if {@code str} is
* {@code null}
*/
public static String toTitleCase(String str) {
if (str == null) {
return null;
}
if (str.length() == 0) {
return str;
}
// convert all caps title to lowercase
if (titleAllUppercase(str)) {
str = str.toLowerCase(Locale.ENGLISH);
}
StringBuilder sb = new StringBuilder();
// number of tokens found since the beginning of the current sentence
int nwords = 0;
// current position in str
int i = 0;
while (i < str.length()) {
// all patterns are anchored at the beginning of the input, so they
// are applied to the remainder of the string
String ss = str.substring(i);
// end of the matched stop word (relative to ss) or -1 if there is none
int swe = -1;
// check for stop word
// (the first word of a sentence is never treated as a stop word)
if (i > 0 && nwords > 0) {
char prevChar = str.charAt(i - 1);
for (Pattern p : TITLECASE_STOPWORD_PATTERNS) {
Matcher m = p.matcher(ss);
if (m.find()) {
if (ss.charAt(m.end() - 1) == '-' && prevChar != '-') {
// skip stop words followed by a hyphen but not preceded
// by a hyphen (e.g. skip "on" in " On-demand" but not "by"
// in "Step-by-Step")
continue;
}
swe = m.end();
break;
}
}
}
if (swe >= 0) {
// w is everything the stop word pattern matched
String w = ss.substring(0, swe);
if (shouldStopwordLowercase(w, ss)) {
sb.append(w.toLowerCase(Locale.ENGLISH));
} else {
sb.append(w);
}
i += swe;
nwords++;
continue;
}
// check if we found a possessive 's
// NOTE(review): POSSESSIVE_S_PATTERN is declared outside of this method;
// presumably it matches the 's at the beginning of ss - confirm
if (i > 0 && Character.isLetterOrDigit(str.charAt(i - 1))) {
Matcher pm = POSSESSIVE_S_PATTERN.matcher(ss);
if (pm.find()) {
sb.append(ss.substring(0, pm.end()).toLowerCase(Locale.ENGLISH));
i += pm.end();
nwords++;
continue;
}
}
// check for mail addresses
// (copied without modification)
Matcher mam = MAIL_PATTERN.matcher(ss);
if (mam.find()) {
sb.append(ss, 0, mam.end());
i += mam.end();
nwords++;
continue;
}
// check for urls
// (copied without modification)
Matcher um = URL_PATTERN.matcher(ss);
if (um.find()) {
sb.append(ss, 0, um.end());
i += um.end();
nwords++;
continue;
}
// check for normal word
Matcher wm = WORD_PATTERN.matcher(ss);
if (wm.find()) {
String w = ss.substring(0, wm.end());
if (shouldCapitalize(w)) {
w = Character.toTitleCase(w.charAt(0)) + w.substring(1);
}
sb.append(w);
i += wm.end();
nwords++;
continue;
}
// nothing matched: handle a single character (e.g. punctuation)
char c = str.charAt(i);
if (c == ':' || c == '.' || c == '“' || c == '‘') {
// start a new sentence
nwords = 0;
}
// maybe start a new sentence but only if there are no whitespace
// characters following the quote
boolean maybeNewSentence = c == '"' || c == '\'';
sb.append(c);
++i;
// eat up whitespaces
boolean foundWhitespace = false;
while (i < str.length() && Character.isWhitespace(c = str.charAt(i))) {
sb.append(c);
foundWhitespace = true;
++i;
}
// a straight quote directly followed by text is treated like an
// opening quote
if (maybeNewSentence && !foundWhitespace) {
nwords = 0;
}
}
return sb.toString();
}
/**
 * Parse the given name, split it into parts, and convert them to initials
 * @param name the name to convert (may be {@code null})
 * @param initializeWith the string to append to each initial
 * @return the converted name or {@code null} if {@code name} is
 * {@code null}
 */
public static String initializeName(String name, String initializeWith) {
    return initializeName(name, initializeWith, false);
}

/**
 * Parse the given name, split it into parts, and either convert them all
 * to initials or only normalize existing initials.
 *
 * The name is split at spaces, hyphens, and periods. A part counts as
 * already initialized if it is followed by a period or if it consists of
 * a single uppercase ASCII letter. Hyphens between parts are preserved.
 *
 * @param name the name to convert (may be {@code null})
 * @param initializeWith the string to append to each initial
 * @param onlyNormalize {@code true} if only existing initials should be
 * normalized and uninitialized names should be kept as is
 * @return the converted name or {@code null} if {@code name} is
 * {@code null}
 */
public static String initializeName(String name, String initializeWith,
        boolean onlyNormalize) {
    // behave like the other public helpers of this class for null input
    if (name == null) {
        return null;
    }

    // trim string, normalize spaces, normalize hyphens
    name = name.trim()
        .replaceAll("\\s+", " ")
        .replaceAll("\\s*\\.", ".")
        .replaceAll("\\.+", ".")
        .replaceAll("\\s*[-\u2010\u2011\u2012\u2013\u2014\u2015]+\\s*", "-");

    // split the name into parts; lp is the start of the current part
    List<NamePart> parts = new ArrayList<>();
    int lp = 0;
    for (int i = 1; i <= name.length(); ++i) {
        if (i == name.length() || name.charAt(i) == ' ') {
            // end of the name or end of a part separated by a space
            if (i > lp) {
                String sub = name.substring(lp, i);
                parts.add(new NamePart(sub, false, sub.length() == 1 && isAllUppercase(sub)));
            }
            lp = i + 1;
        } else if (name.charAt(i) == '-') {
            // part followed by a hyphen
            if (i > lp) {
                String sub = name.substring(lp, i);
                parts.add(new NamePart(sub, true, sub.length() == 1 && isAllUppercase(sub)));
            }
            lp = i + 1;
        } else if (name.charAt(i) == '.' && (i < name.length() - 1 && name.charAt(i + 1) == '-')) {
            // existing initial followed by a hyphen: skip both characters
            if (i > lp) {
                parts.add(new NamePart(name.substring(lp, i), true, true));
            }
            i++;
            lp = i + 1;
        } else if (name.charAt(i) == '.') {
            // existing initial
            if (i > lp) {
                parts.add(new NamePart(name.substring(lp, i), false, true));
            }
            lp = i + 1;
        }
    }

    StringBuilder result = new StringBuilder();
    for (int i = 0; i < parts.size(); i++) {
        NamePart p = parts.get(i);

        // when normalizing, separate parts by a space unless both this and
        // the previous part are initials or a separator is already there
        if (onlyNormalize && i > 0 &&
                (!p.alreadyInitialized || !parts.get(i - 1).alreadyInitialized) &&
                result.length() > 0 &&
                result.charAt(result.length() - 1) != ' ' &&
                result.charAt(result.length() - 1) != '-') {
            result.append(" ");
        }

        // keep the part as it is or reduce it to its first character
        if (onlyNormalize || p.alreadyInitialized) {
            result.append(p.part);
        } else {
            result.append(p.part.charAt(0));
        }

        // uninitialized parts get no suffix if they are only normalized
        if (!onlyNormalize || p.alreadyInitialized) {
            result.append(initializeWith);
        }

        if (p.hyphen) {
            result.append("-");
        }
    }

    // remove whitespace that initializeWith may have put before a hyphen
    // or at the end of the result
    return result.toString()
        .replaceAll("\\s+-", "-")
        .trim();
}
}