org.sweble.wikitext.lazy.parser.Url.rats Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of swc-parser-lazy Show documentation
Show all versions of swc-parser-lazy Show documentation
A parser for MediaWiki's Wikitext.
/**
* Copyright 2011 The Open Source Research Group,
* University of Erlangen-Nürnberg
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
*
* Url
* ---
*
* Grammar:
* - Protocol ::= [A-Za-z] [A-Za-z0-9+\-.]*
* - Path ::= [^\u0000-\u0020\u007F\uE000\u2028\u2029\u0085\"\[\]<>|]+
* - Url ::= Protocol ':' Path
*
* AST node:
* Name : Url
* Extends : LeafNode
* Constructor : "protocol, path"
* NodeType : org.sweble.wikitext.lazy.AstNodeTypes.NT_URL
*
* Properties:
* protocol : String
* path : String
*
*/
module org.sweble.wikitext.lazy.parser.Url;
// -- Code ---------------------------------------------------------------------
body
{
/**
 * Checks whether the given text ends in a protocol name that, combined
 * with the given path, is a URL protocol known to the parser configuration.
 *
 * The candidate handed to the configuration is the extracted protocol
 * followed by ":" or, when the path begins with "//", by "://".
 *
 * @param text
 *            Text whose trailing characters may form a protocol name.
 * @param path
 *            The part of the URL that follows the ':' separator.
 * @return false if no protocol can be extracted from the text, otherwise
 *         the answer of the configuration's isUrlProtocol() check.
 */
private boolean isProtocol(Text text, String path)
{
	final String name = extractProtocol(text);
	if (name == null)
		return false;

	final StringBuilder candidate = new StringBuilder(name).append(':');
	if (path.startsWith("//"))
		candidate.append("//");

	return getState().getConfig().isUrlProtocol(candidate.toString());
}
/**
 * Checks whether the given protocol name, combined with the given path, is
 * a URL protocol known to the parser configuration.
 *
 * @param protocol
 *            The protocol name without the trailing ':'.
 * @param path
 *            The part of the URL that follows the ':' separator. If it
 *            begins with "//" the protocol is looked up as "name://",
 *            otherwise as "name:".
 * @return the answer of the configuration's isUrlProtocol() check.
 */
private boolean isProtocol(String protocol, String path)
{
	// Build the lookup key first, then ask the configuration.
	final String candidate = protocol + ':' + (path.startsWith("//") ? "//" : "");
	return getState().getConfig().isUrlProtocol(candidate);
}
/**
 * Builds a Url node from the protocol found at the end of the given text
 * and the given path.
 *
 * Any characters of the text that precede the protocol are preserved: if
 * there are some, the result is a NodeList holding a Text node with that
 * prefix followed by the Url node; otherwise the Url node is returned on
 * its own.
 *
 * The text must end in a protocol, i.e. extractProtocol(text) must not
 * return null; otherwise computing the prefix length fails with a
 * NullPointerException.
 *
 * @param text
 *            Text that ends with the protocol name.
 * @param path
 *            The part of the URL that follows the ':' separator.
 * @return the Url node, or a NodeList of prefix text and Url node.
 */
private AstNode makeExternalLink(Text text, String path)
{
	final String protocol = extractProtocol(text);
	final String content = text.getContent();

	// Everything in front of the protocol stays plain text.
	final int prefixLength = content.length() - protocol.length();
	final String prefix = content.substring(0, prefixLength);

	final Url url = new Url(protocol, path);
	if (isGatherRtData())
		addRtData(url, joinRt(protocol, ':', path));

	if (!prefix.isEmpty())
		return new NodeList(new Text(prefix), url);

	return url;
}
/**
 * Extracts the protocol name that the given text ends with, if any.
 *
 * The text is scanned backwards over the characters allowed in a protocol
 * name ([A-Za-z0-9+\-.], see http://www.ietf.org/rfc/rfc1738.txt). The
 * scanned run only counts as a protocol if its first character (the last
 * one visited) is a letter, and if the character in front of it is not a
 * word character as defined by isPerlWordCharacter().
 *
 * @param text
 *            Text whose trailing characters may form a protocol name.
 * @return the protocol name (the whole content if all of it is a valid
 *         protocol), or null if the text is empty or does not end in a
 *         valid protocol name at a word boundary.
 */
private String extractProtocol(Text text)
{
	final String s = text.getContent();
	if (s.isEmpty())
		return null;

	int i = s.length() - 1;

	// True while the left-most character consumed so far is a letter,
	// i.e. while s.substring(i + 1) starts like a valid protocol name.
	boolean validEnd = false;

	while (i >= 0)
	{
		// All accepted characters are ASCII, so inspecting single UTF-16
		// code units is sufficient here.
		final char ch = s.charAt(i);

		if ((('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z')))
		{
			validEnd = true;
		}
		else if ((('0' <= ch) && (ch <= '9')) || (ch == '+') || (ch == '-') || (ch == '.'))
		{
			validEnd = false;
		}
		else
		{
			break;
		}

		--i;
	}

	if (!validEnd)
		return null;

	// The whole content is a protocol name.
	if (i < 0)
		return s;

	// Read the complete code point that ends at index i. Looking only at
	// the code unit at i would yield a lone low surrogate for characters
	// outside the BMP, which is never classified as a word character, so a
	// protocol glued to a supplementary-plane letter or digit would be
	// accepted by mistake.
	final int boundary = s.codePointBefore(i + 1);

	return isPerlWordCharacter(boundary) ? null : s.substring(i + 1);
}
/**
 * Tells whether the given code point is a "word" character: an underscore,
 * a letter (general categories Lu, Ll, Lt, Lm, Lo) or a decimal digit
 * (general category Nd).
 *
 * According to http://www.php.net/manual/en/regexp.reference.escape.php
 *
 * @param cp
 *            The Unicode code point to classify.
 * @return true if the code point is a word character, false otherwise.
 */
private boolean isPerlWordCharacter(int cp)
{
	// Character.isLetter(int) covers exactly the five letter categories and
	// Character.isDigit(int) exactly DECIMAL_DIGIT_NUMBER.
	return (cp == '_') || Character.isLetter(cp) || Character.isDigit(cp);
}
}
// -- Url ----------------------------------------------------------------------
/* Matches a protocol string, the ':' separator and a path string. The
 * semantic predicate rejects the match unless isProtocol(protocol, path)
 * holds. On success a Url node is created from protocol and path; if
 * isGatherRtData() is true, the joined source text "protocol:path" is
 * attached to the node via addRtData().
 */
Url Url =
protocol:UrlProtocolString ':' path:UrlPathString &{ isProtocol(protocol, path) }
{
yyValue = new Url(protocol, path);
if (isGatherRtData())
addRtData(yyValue, joinRt(protocol, ':', path));
}
;
// String productions that simply forward to the private transient token
// productions tUrlProtocolString and tUrlPathString defined below.
String UrlProtocolString = tUrlProtocolString ;
String UrlPathString = tUrlPathString ;
/* The part before the separator ":".
 * According to http://www.ietf.org/rfc/rfc1738.txt
 * For example: http, ftp, mailto, ...
 *
 * Matches one ASCII letter followed by any number of ASCII letters,
 * digits, '+', '-' or '.'.
 */
private transient String tUrlProtocolString =
[A-Za-z] [A-Za-z0-9+\-.]*
;
/* The part after the separator ":".
 *
 * Matches one or more characters, none of which may lie in the range
 * U+0000 to U+0020 or be one of: '"', '[', ']', '<', '>', '|', U+007F or
 * U+E000.
 *
 * Additionally excluded as newline characters: \u2028\u2029\u0085
 * FIXME: Is this the official RFC or is this what MediaWiki expects?
 */
private transient String tUrlPathString =
( ![\u0000-\u0020] ![\"\[\]<>|\u007F\uE000\u2028\u2029\u0085] _ )+
;
// -- End of file -------------------------------------------------------------