All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.smartlogic.tools.LabelForUriManglator Maven / Gradle / Ivy

package com.smartlogic.tools;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.regex.Pattern;

import org.apache.commons.text.WordUtils;
import org.apache.jena.ext.com.google.common.base.CharMatcher;
import org.apache.jena.ext.com.google.common.base.Strings;
import org.apache.jena.ext.com.google.common.collect.ImmutableMap;

/**
 * Utilities for converting labels to part of prefLabel and altLabel URIs.
 */
@SuppressWarnings("deprecation")
public class LabelForUriManglator {

	private static final CharMatcher SPECIAL_ASCII_MATCHER = CharMatcher.ascii()
			.and(CharMatcher.javaLetterOrDigit().negate()).and(CharMatcher.noneOf("._ -"));

	public static String mangle(String label) {
		label = simplify(label);
		label = stripSpecialAsciiCharacters(label);
		label = toCamelCase(label);
		label = applyEndodingWithStripingOfPercents(label);
		label = stripRepeatedHyphens(label);
		label = fixCornerCasesForJena(label);
		return label;
	}

	private static String stripSpecialAsciiCharacters(String label) {
		return SPECIAL_ASCII_MATCHER.replaceFrom(label, "-");
	}

	private static String toCamelCase(String label) {
		if (label.length() >= 1) {
			String first = label.substring(0, 1);
			label = WordUtils.capitalize(label);
			label = first + label.substring(1);
			return label.replaceAll(" ", "");
		} else {
			return label;
		}
	}

	private static String applyEndodingWithStripingOfPercents(String label) {
		label = urlEncode(label);
		return label.replace("%", "");
	}

	private static String stripRepeatedHyphens(String label) {
		return label.replaceAll("[-]+", "-");
	}

	/**
	 * 
	 * 1. leading '-' is prefixed by '_' because of URI compatibility issues
	 * 2. empty labels is converted to '_'
	 * 
*/ private static String fixCornerCasesForJena(String label) { if (Strings.isNullOrEmpty(label)) { return "_"; } if (label.startsWith("-")) { return "_" + label.substring(1); } return label; } private static String urlEncode(String label) { try { return URLEncoder.encode(label, StandardCharsets.UTF_8.name()); } catch (UnsupportedEncodingException uee) { throw new RuntimeException("Unsupported encoding.", uee); } } private static final ImmutableMap NONDIACRITICS = ImmutableMap.builder() // Replace non-diacritics as their equivalent characters .put("\u0141", "l") // BiaLystock .put("\u0142", "l") // Bialystock .put("ß", "ss").put("æ", "ae").put("ø", "o").put("©", "c").put("\u00D0", "d") // All Ð ð // from // http://de.wikipedia.org/wiki/%C3%90 .put("\u00F0", "d").put("\u0110", "d").put("\u0111", "d").put("\u0189", "d").put("\u0256", "d") .put("\u00DE", "th") // thorn Þ .put("\u00FE", "th") // thorn þ .build(); /* * Special regular expression character ranges relevant for simplification -> * see http://docstore.mik.ua/orelly/perl/prog3/ch05_04.htm * InCombiningDiacriticalMarks: special marks that are part of "normal" ä, ö, î * etc.. IsSk: Symbol, Modifier see * http://www.fileformat.info/info/unicode/category/Sk/list.htm IsLm: Letter, * Modifier see http://www.fileformat.info/info/unicode/category/Lm/list.htm */ private static final Pattern DIACRITICS_AND_FRIENDS = Pattern .compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+"); public static String simplify(String str) { str = stripDiacritics(str); return stripNonDiacritics(str); } private static String stripNonDiacritics(String str) { StringBuffer ret = new StringBuffer(); for (int i = 0; i < str.length(); i++) { String source = str.substring(i, i + 1); String replace = NONDIACRITICS.get(source); if (replace == null) { ret.append(source); } else { ret.append(replace); } } return ret.toString(); } private static String stripDiacritics(String str) { str = Normalizer.normalize(str, Normalizer.Form.NFD); return DIACRITICS_AND_FRIENDS.matcher(str).replaceAll(""); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy