All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.sweble.wikitext.lazy.parser.Url.rats Maven / Gradle / Ivy

There is a newer version: 3.1.9
Show newest version
/**
 * Copyright 2011 The Open Source Research Group,
 *                University of Erlangen-Nürnberg
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*!
 *
 * Url
 * ---
 *
 *   Grammar:
 *     - Protocol ::= [A-Za-z] [A-Za-z0-9+\-.]*
 *     - Path     ::= [^\u0000-\u0020\u007F\uE000\u2028\u2029\u0085\"\[\]<>|]+
 *     - Url      ::= Protocol ':' Path
 *
 *   AST node:
 *     Name        : Url
 *     Extends     : LeafNode
 *     Constructor : "protocol, path"
 *     NodeType    : org.sweble.wikitext.lazy.AstNodeTypes.NT_URL
 *
 *     Properties:
 *       protocol : String
 *       path     : String
 *
 */

module org.sweble.wikitext.lazy.parser.Url;




// -- Code ---------------------------------------------------------------------

body
{
  /**
   * Checks whether the given text ends in a protocol name that, combined
   * with the given path, is accepted by the String-based overload of this
   * method.
   *
   * @param text text node whose trailing characters may form a protocol name
   * @param path the part of the URL that follows the ':' separator
   * @return false if no protocol name can be extracted from the text,
   *         otherwise the result of isProtocol(String, String)
   */
  private boolean isProtocol(Text text, String path)
  {
    final String candidate = extractProtocol(text);

    // The String overload appends the ":" / "://" suffix and asks the
    // configuration, so the lookup logic lives in one place only.
    return (candidate != null) && isProtocol(candidate, path);
  }

  /**
   * Asks the parser configuration whether the given protocol name is a
   * known URL protocol.
   *
   * The name is looked up with ":" appended, or with "://" appended when the
   * path starts with "//".
   *
   * @param protocol protocol name without the trailing ':'
   * @param path the part of the URL that follows the ':' separator
   * @return the answer of getState().getConfig().isUrlProtocol(...)
   */
  private boolean isProtocol(String protocol, String path)
  {
    final String separator = path.startsWith("//") ? "://" : ":";
    return getState().getConfig().isUrlProtocol(protocol + separator);
  }

  /**
   * Builds a Url node from the protocol name found at the end of the given
   * text and the given path.
   *
   * If the text contains characters in front of the protocol name, those
   * characters are kept as a separate Text node and returned together with
   * the Url node in a NodeList.
   *
   * NOTE(review): extractProtocol() may return null, in which case this
   * method fails with a NullPointerException. Presumably callers check
   * isProtocol(text, path) first -- confirm at the call sites.
   *
   * @param text text node that ends in the protocol name
   * @param path the part of the URL that follows the ':' separator
   * @return the Url node alone, or a NodeList of leading Text and Url node
   */
  private AstNode makeExternalLink(Text text, String path)
  {
    final String protocol = extractProtocol(text);
    final String content = text.getContent();

    // Everything in front of the trailing protocol name stays plain text.
    final int protocolStart = content.length() - protocol.length();
    final String leadingText = content.substring(0, protocolStart);

    final Url url = new Url(protocol, path);
    if (isGatherRtData())
    {
      addRtData(url, joinRt(protocol, ':', path));
    }

    if (!leadingText.isEmpty())
    {
      return new NodeList(new Text(leadingText), url);
    }

    return url;
  }

  /**
   * Extracts a candidate protocol name from the end of the given text.
   *
   * The content is scanned backwards from its last character over the
   * characters allowed in a protocol name (ASCII letters, ASCII digits,
   * '+', '-' and '.'). The candidate is accepted only if
   * - its first (leftmost) character is an ASCII letter, and
   * - it is not directly preceded by a word character as defined by
   *   isPerlWordCharacter().
   *
   * @param text text node to inspect
   * @return the trailing protocol candidate (the whole content if all of it
   *         forms a valid candidate), or null if there is none
   */
  private String extractProtocol(Text text)
  {
    String s = text.getContent();
    if (s.isEmpty())
      return null;

    int i = s.length() - 1;

    // True while the leftmost character accepted so far is a letter, i.e.
    // while the scanned suffix would be a well-formed protocol name.
    boolean startsWithLetter = false;

    while (i >= 0)
    {
      // Every accepted character is ASCII, so inspecting single chars is
      // sufficient here; surrogates simply end the scan.
      char ch = s.charAt(i);

      // According to http://www.ietf.org/rfc/rfc1738.txt
      if ((('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z')))
      {
        startsWithLetter = true;
      }
      else if ((('0' <= ch) && (ch <= '9')) || (ch == '+') || (ch == '-') || (ch == '.'))
      {
        startsWithLetter = false;
      }
      else
      {
        break;
      }

      --i;
    }

    if (!startsWithLetter)
      return null;

    // The whole content is a protocol candidate; nothing precedes it.
    if (i < 0)
      return s;

    // Read the complete code point that ends at index i. When scanning
    // backwards, index i may hold the low surrogate of a supplementary
    // character; codePointAt(i) would only yield that lone surrogate and a
    // supplementary letter or digit would not be recognized as a word
    // character.
    int boundary = s.codePointBefore(i + 1);

    return isPerlWordCharacter(boundary) ? null : s.substring(i + 1);
  }

  /**
   * Tells whether the given code point counts as a "word" character: the
   * underscore, any letter or any decimal digit.
   *
   * According to http://www.php.net/manual/en/regexp.reference.escape.php
   *
   * @param cp the Unicode code point to classify
   * @return true for '_' and for code points of the general categories
   *         Lu, Ll, Lt, Lm, Lo and Nd; false otherwise
   */
  private boolean isPerlWordCharacter(int cp)
  {
    // Character.isLetterOrDigit(int) is defined over exactly the letter
    // categories (Lu, Ll, Lt, Lm, Lo) plus DECIMAL_DIGIT_NUMBER (Nd).
    return (cp == '_') || Character.isLetterOrDigit(cp);
  }
}




// -- Url ----------------------------------------------------------------------

/* Matches protocol ':' path and produces a Url node.
 *
 * The semantic predicate &{ ... } lets the match succeed only if
 * isProtocol(protocol, path) returns true. When isGatherRtData() is true,
 * the result of joinRt(protocol, ':', path) is attached to the new node
 * via addRtData().
 */
Url Url =
  protocol:UrlProtocolString ':' path:UrlPathString &{ isProtocol(protocol, path) }
  {
    yyValue = new Url(protocol, path);
    if (isGatherRtData())
      addRtData(yyValue, joinRt(protocol, ':', path));
  }
;

/* Non-private alias for the private production tUrlProtocolString. */
String UrlProtocolString = tUrlProtocolString ;

/* Non-private alias for the private production tUrlPathString. */
String UrlPathString = tUrlPathString ;

/* The part before the separator ":".
 * According to http://www.ietf.org/rfc/rfc1738.txt
 * For example: http, ftp, mailto, ...
 *
 * Matches one ASCII letter followed by any number of ASCII letters, digits,
 * '+', '-' or '.'. Whether the matched name is an accepted protocol is not
 * decided here but by the isProtocol() predicate in the Url production.
 */
private transient String tUrlProtocolString =
  [A-Za-z] [A-Za-z0-9+\-.]*
;

/* The part after the separator ":".
 *
 * Matches one or more characters, each of which is neither in the range
 * \u0000-\u0020 nor one of: double quote, '[', ']', '<', '>', '|', \u007F,
 * \uE000, \u2028, \u2029, \u0085. The two leading "!" predicates perform
 * these checks without consuming input; "_" then consumes the character.
 *
 * Additionally excluded as newline characters: \u2028\u2029\u0085
 * NOTE(review): the reason for excluding \uE000 is not visible in this
 * file -- confirm.
 * FIXME: Is this the official RFC or is this what MediaWiki expects?
 */
private transient String tUrlPathString =
  ( ![\u0000-\u0020] ![\"\[\]<>|\u007F\uE000\u2028\u2029\u0085] _ )+
;




// -- End of file -------------------------------------------------------------




© 2015 - 2024 Weber Informatics LLC | Privacy Policy