org.sweble.wikitext.lazy.LinkTargetParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of swc-parser-lazy Show documentation
A parser for MediaWiki's Wikitext.
There is a newer version: 3.1.9
/**
 * Copyright 2011 The Open Source Research Group,
 *                University of Erlangen-Nürnberg
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.sweble.wikitext.lazy;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.fau.cs.osr.utils.XmlGrammar;

/**
 * Expects the string to contain only valid Unicode characters. It must not
 * contain invalid, non- or private use characters. It further expects the
 * string to not contain the following characters:
 * [\u0000-\u001F\u007F\uFFFD<>{}|[\]].
 *
 * The parser checks if the link target contains any of the following entities,
 * which are not allowed in link targets:
 *
 * <ul>
 * <li>Percent encoding of URIs: {@code %[0-9A-Fa-f]{2}}</li>
 * <li>XML entity references: {@code &<Name>;}</li>
 * <li>XML char references: {@code (&#[0-9]+;)|(&#x[0-9A-Fa-f]+;)}</li>
 * <li>Relative path components: {@code (^\.\.?($|/))|(/\.\.?/)|(/\.\.?$)}</li>
 * <li>Magic tilde sequences: {@code ~~~}</li>
 * </ul>
 */
public class LinkTargetParser
{
	/**
	 * Normalized page title left over once parse() has removed the initial
	 * colon, namespace/interwiki prefixes and the fragment. Spaces are
	 * represented as underscores. Stays null until parse() succeeds.
	 */
	private String title;

	/**
	 * Text following the first '#' of the target (underscore-trimmed), or null
	 * if the target contains no '#'.
	 */
	private String fragment;

	/**
	 * Prefix that the parser configuration recognized as a namespace, or null
	 * if no such prefix was found.
	 */
	private String namespace;

	/**
	 * Prefix that the parser configuration recognized as a non-local interwiki
	 * name, or null if no such prefix was found.
	 */
	private String interwiki;

	/**
	 * True if the normalized target started with ':' or if a ':' directly
	 * followed a non-local interwiki prefix.
	 */
	private boolean initialColon;

	// =========================================================================

	/**
	 * Matches a single bidirectional control character (U+200E, U+200F and
	 * U+202A through U+202E). parse() deletes every match from the target.
	 */
	private final static Pattern bidiCharPattern = Pattern.compile(
	        "[\u200E\u200F\u202A-\u202E]");

	/**
	 * Matches a run of one or more space-like characters: ASCII space,
	 * underscore, U+00A0, U+1680, U+180E, U+2000 through U+200A, U+2028,
	 * U+2029, U+202F, U+205F and U+3000. parse() replaces each run with a
	 * single underscore.
	 */
	private final static Pattern spacePlusPattern = Pattern.compile(
	        "[ _\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+");

	/**
	 * Splits a target at its first usable colon. Group 1 is the shortest
	 * non-empty prefix in front of the colon, group 2 is everything behind it.
	 * Underscores immediately around the colon belong to neither group.
	 */
	private final static Pattern namespaceSeparatorPattern = Pattern.compile(
	        "^(.+?)_*:_*(.*)$");

	/**
	 * Finds constructs that must not remain in a title after decoding. It is
	 * used with find(), so a match anywhere in the title rejects the title.
	 */
	private final static Pattern invalidTitle = Pattern.compile(
	        // Percent encoding for URIs
	        "(%[0-9A-Fa-f]{2})" +

	                // XML entity reference
	                "|(&" + XmlGrammar.RE_XML_NAME + ";)" +

	                // XML char reference
	                "|((&#[0-9]+;)|(&#x[0-9A-Fa-f]+;))" +

	                // Relative path components
	                "|(^\\.\\.?($|/))" +
	                "|(/\\.\\.?/)" +
	                "|(/\\.\\.?$)" +

	                // No magic tilde sequences
	                "|(~~~)");

	// =========================================================================

	/**
	 * Normalizes a link target and splits it into title, fragment, namespace
	 * and interwiki prefix. The results are available through the getters
	 * afterwards; the fields are only written once the whole target has been
	 * accepted, so a failed call leaves the previous results in place.
	 *
	 * @param config
	 *            Used to resolve XML entities and to decide whether a prefix is
	 *            a namespace, an interwiki name or a local interwiki name.
	 * @param target
	 *            The raw link target.
	 * @throws LinkTargetException
	 *             If the normalized target is empty, if a namespace or
	 *             non-local interwiki prefix is followed by another namespace
	 *             or interwiki prefix, if a local interwiki prefix is followed
	 *             by nothing, if the title contains a construct matched by
	 *             {@code invalidTitle}, or if only a namespace is given.
	 */
	public void parse(ParserConfigInterface config, final String target) throws LinkTargetException
	{
		// Normalization: undo percent encoding, resolve XML references, drop
		// bidi control characters, trim whitespace, turn every run of
		// space-like characters into one underscore and finally trim
		// underscores on both ends.
		final String urlDecoded = urlDecode(target);
		final String xmlDecoded = xmlDecode(config, urlDecoded);
		final String withoutBidi = bidiCharPattern.matcher(xmlDecoded).replaceAll("");
		final String collapsed = spacePlusPattern.matcher(trim(withoutBidi)).replaceAll("_");

		String rest = trimUnderscore(collapsed);
		if (rest.isEmpty())
			throw new LinkTargetException(target, "Target has empty title");

		boolean leadingColon = false;
		if (rest.charAt(0) == ':')
		{
			leadingColon = true;
			rest = trimUnderscore(rest.substring(1));
		}

		String ns = null;
		String iw = null;

		// A local interwiki prefix may still be followed by a namespace, which
		// is why it is tracked separately from the other two prefix kinds.
		boolean seenLocalInterwiki = false;
		boolean seenNsOrInterwiki = false;

		// Peel off one "prefix:" per iteration for as long as the
		// configuration recognizes the prefix.
		for (Matcher split = namespaceSeparatorPattern.matcher(rest);
		        split.matches();
		        split = namespaceSeparatorPattern.matcher(rest))
		{
			final String prefix = split.group(1);
			final String remainder = split.group(2);

			if (config.isNamespace(prefix))
			{
				if (seenNsOrInterwiki)
					throw new LinkTargetException(
					        target,
					        "The namespace in a link target may not be followed by another namespace or interwiki name");

				ns = prefix;
				rest = remainder;
				seenNsOrInterwiki = true;
				continue;
			}

			// Neither a namespace nor an interwiki name: the colon is part of
			// the title itself.
			if (!config.isInterwikiName(prefix))
				break;

			if (seenNsOrInterwiki || seenLocalInterwiki)
				throw new LinkTargetException(
				        target,
				        "The namespace in a link target may not be followed by another namespace or interwiki name");

			rest = remainder;

			if (config.isLocalInterwikiName(prefix))
			{
				if (rest.isEmpty())
					throw new LinkTargetException(
					        target,
					        "Empty article title!");

				// The local prefix is dropped without being recorded.
				seenLocalInterwiki = true;
				continue;
			}

			iw = prefix;

			// A colon right behind the interwiki prefix counts as an initial
			// colon as well.
			if (!rest.isEmpty() && rest.charAt(0) == ':')
			{
				leadingColon = true;
				rest = trimUnderscore(rest.substring(1));
			}

			seenNsOrInterwiki = true;
		}

		// Everything behind the first '#' is the fragment.
		String frag = null;
		final int hash = rest.indexOf('#');
		if (hash >= 0)
		{
			frag = trimUnderscore(rest.substring(hash + 1));
			rest = trimUnderscore(rest.substring(0, hash));
		}

		if (invalidTitle.matcher(rest).find())
			throw new LinkTargetException(
			        target,
			        "The title contains invalid entities");

		// Empty links to a namespace alone are not allowed
		final boolean namespaceOnly = rest.isEmpty() && iw == null && ns != null;
		if (namespaceOnly)
			throw new LinkTargetException(
			        target,
			        "A namespace alone is not a valid link target");

		this.title = rest;
		this.fragment = frag;
		this.namespace = ns;
		this.interwiki = iw;
		this.initialColon = leadingColon;
	}

	/**
	 * Decodes percent-encoded printable ASCII characters in a link target.
	 *
	 * Only escapes that denote a character in the range 0x20 to 0x7E are
	 * decoded; every other escape is copied unchanged. If the input contains
	 * at least one '%' character, every '+' that is not consumed as part of a
	 * decoded escape is turned into a space. Input without any '%' is
	 * returned as is.
	 *
	 * @param result
	 *            The text to decode.
	 * @return The decoded text, or the input itself if it contains no '%'.
	 */
	static String urlDecode(String result)
	{
		// It's intentional that only '%' characters trigger the decoding.
		// MediaWiki does not decode '+' characters if there's not at least
		// one '%' character :D
		if (result.indexOf('%') < 0)
			return result;

		final int length = result.length();
		StringBuilder b = new StringBuilder(length);
		for (int i = 0; i < length; ++i)
		{
			char ch = result.charAt(i);
			if (ch == '%' && i + 2 < length)
			{
				// Only ASCII hex digits form a valid escape. Parsing with
				// Integer.valueOf(s, 16) would also accept signs and
				// non-ASCII digits such as fullwidth forms.
				int hi = asciiHexValue(result.charAt(i + 1));
				int lo = asciiHexValue(result.charAt(i + 2));
				if (hi >= 0 && lo >= 0)
				{
					int val = (hi << 4) | lo;
					if (val >= 0x20 && val < 0x7F)
					{
						ch = (char) val;
						// Skip the two digits that were just consumed.
						i += 2;
					}
				}
			}
			else if (ch == '+')
			{
				ch = ' ';
			}

			b.append(ch);
		}

		return b.toString();
	}

	/**
	 * Returns the value of an ASCII hexadecimal digit or -1 if the given
	 * character is not one of 0-9, a-f or A-F.
	 */
	private static int asciiHexValue(char c)
	{
		if (c >= '0' && c <= '9')
			return c - '0';
		if (c >= 'a' && c <= 'f')
			return c - 'a' + 10;
		if (c >= 'A' && c <= 'F')
			return c - 'A' + 10;
		return -1;
	}

	/**
	 * Replaces XML entity and character references in a link target.
	 *
	 * Named references are resolved through
	 * {@code config.resolveXmlEntity()}; if that yields null the reference is
	 * copied unchanged. Numeric references are replaced by the character they
	 * denote, unless the value is below 0x20, equals 0x7F, lies in the
	 * surrogate range, is not a valid Unicode code point or does not fit into
	 * an int. Those references are copied unchanged as well. Input without any
	 * '&amp;' is returned as is.
	 *
	 * @param config
	 *            Resolves named entities.
	 * @param result
	 *            The text to decode.
	 * @return The decoded text, or the input itself if it contains no '&amp;'.
	 */
	static String xmlDecode(ParserConfigInterface config, String result)
	{
		if (result.indexOf('&') < 0)
			return result;

		// One matcher is enough: find(int) resets it and searches from the
		// given offset.
		Matcher m = XmlGrammar.xmlReference().matcher(result);

		StringBuilder b = new StringBuilder(result.length());
		int start = 0;
		while (m.find(start))
		{
			// Text between the previous reference and this one.
			b.append(result, start, m.start());

			String resolved = null;
			if (m.group(1) != null)
			{
				// Group 1 holds the name of an entity reference.
				resolved = config.resolveXmlEntity(m.group(1));
			}
			else
			{
				// Group 2 holds decimal digits, group 3 hexadecimal digits.
				boolean decimal = m.group(2) != null;
				String num = decimal ? m.group(2) : m.group(3);
				try
				{
					int val = Integer.parseInt(num, decimal ? 10 : 16);

					boolean surrogate =
					        val >= Character.MIN_SURROGATE &&
					                val <= Character.MAX_SURROGATE;

					// Character.toChars() yields a surrogate pair for code
					// points above U+FFFF; a plain cast to char would
					// silently truncate them to a different character.
					if (val >= 0x20 &&
					        val != 0x7F &&
					        !surrogate &&
					        Character.isValidCodePoint(val))
					{
						resolved = new String(Character.toChars(val));
					}
				}
				catch (NumberFormatException ignored)
				{
					// The number does not fit into an int, hence it cannot be
					// a code point. Leave the reference as it is.
				}
			}

			if (resolved != null)
				b.append(resolved);
			else
				b.append(result, m.start(), m.end());

			start = m.end();
		}

		// Text behind the last reference.
		b.append(result, start, result.length());

		return b.toString();
	}

	/**
	 * Removes all leading and trailing underscores.
	 *
	 * @param input
	 *            The text to trim.
	 * @return The text without surrounding underscores, or the input itself if
	 *         there was nothing to remove.
	 */
	static String trimUnderscore(String input)
	{
		final int length = input.length();

		int begin = 0;
		while (begin < length && input.charAt(begin) == '_')
			begin++;

		int end = length;
		while (end > begin && input.charAt(end - 1) == '_')
			end--;

		if (begin == 0 && end == length)
			return input;

		return input.substring(begin, end);
	}

	/**
	 * Removes all leading and trailing characters for which
	 * {@link Character#isWhitespace(char)} returns true.
	 *
	 * @param input
	 *            The text to trim.
	 * @return The text without surrounding whitespace, or the input itself if
	 *         there was nothing to remove.
	 */
	static String trim(String input)
	{
		final int length = input.length();

		int begin = 0;
		while (begin < length && Character.isWhitespace(input.charAt(begin)))
			begin++;

		int end = length;
		while (end > begin && Character.isWhitespace(input.charAt(end - 1)))
			end--;

		if (begin == 0 && end == length)
			return input;

		return input.substring(begin, end);
	}

	// =========================================================================

	/**
	 * @return The title stored by the last successful call to parse(), or null
	 *         if parse() has not succeeded yet.
	 */
	public String getTitle()
	{
		return title;
	}

	/**
	 * @return The fragment stored by the last successful call to parse(); null
	 *         if the target had no '#' or parse() has not succeeded yet.
	 */
	public String getFragment()
	{
		return fragment;
	}

	/**
	 * @return The namespace prefix stored by the last successful call to
	 *         parse(); null if none was found or parse() has not succeeded yet.
	 */
	public String getNamespace()
	{
		return namespace;
	}

	/**
	 * @return The non-local interwiki prefix stored by the last successful
	 *         call to parse(); null if none was found or parse() has not
	 *         succeeded yet.
	 */
	public String getInterwiki()
	{
		return interwiki;
	}

	/**
	 * @return The initial colon flag stored by the last successful call to
	 *         parse(); false if parse() has not succeeded yet.
	 */
	public boolean isInitialColon()
	{
		return initialColon;
	}
}