org.kefirsf.bb.proc.ProcUrl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of kefirbb Show documentation
Show all versions of kefirbb Show documentation
KefirBB is a Java library for text processing. It was initially developed for BB2HTML translation,
but its flexible configuration allows it to be used in other cases, for example for parsing Markdown and Textile,
and for HTML filtering.
The newest version!
package org.kefirsf.bb.proc;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
/**
 * The pattern element to parse URLs.
 * <p>
 * A URL is recognised piece by piece: schema, authority, host, port, path, query and fragment.
 * Depending on the flags given to the constructor, the element can also accept local URLs
 * (starting with "/", "./" or "../") or URLs without a schema.
 *
 * @author kefir
 */
public class ProcUrl extends AbstractUrl {
    /** A percent-encoded octet such as {@code %2F}. */
    private static final String PCT_ENCODED = "(%\\p{XDigit}{2})";

    /**
     * A single path character or a percent-encoded octet. This is an alternation without its own
     * group, so it must always be wrapped in parentheses where it is used.
     */
    private static final String PCHAR = "[\\w~!$&'*+,;=:@\\(\\)\\.\\-]|" + PCT_ENCODED;

    /** A colon followed by a port number of 1 to 5 digits (ports go up to 65535). */
    private static final Pattern REGEX_PORT = Pattern.compile(
            ":\\d{1,5}"
    );

    /** Zero or more "/segment" parts with an optional trailing slash. */
    private static final Pattern REGEX_PATH = Pattern.compile(
            "(/(" + PCHAR + ")+)*/?"
    );

    /** A "#" followed by path characters, "/" or "?". */
    private static final Pattern REGEX_FRAGMENT = Pattern.compile(
            "#(" + PCHAR + "|[/?])*"
    );

    /** The start of a local URL: "/", "./" or "../". */
    private static final Pattern REGEX_LOCAL_PREFIX = Pattern.compile("\\.{0,2}/");

    /** The literal prefixes searched for when local URLs are allowed. */
    private static final String[] LOCAL_PREFIXES = {"/", "./", "../"};

    private final boolean local;
    private final boolean schemaless;

    /**
     * Create a named URL variable
     *
     * @param name       variable name
     * @param ghost      don't move the cursor after parsing
     * @param local      Parse local URLs also
     * @param schemaless Parse only schemaless URL
     */
    public ProcUrl(String name, boolean ghost, boolean local, boolean schemaless) {
        super(name, ghost);
        this.local = local;
        this.schemaless = schemaless;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Searching is not supported for schemaless URLs: in that mode the method always returns -1.
     * Otherwise the nearest known prefix is located and parsed; if it does not start a valid URL
     * the search resumes one character after it.
     */
    @Override
    public int findIn(Source source) {
        if (schemaless) {
            return -1;
        }

        final int sourceLength = source.length();

        // The set of prefixes depends only on this element's configuration,
        // so it is built once rather than on every iteration of the search loop.
        final List<String> prefixes = preparePrefixes();

        int start = source.getOffset();
        int index;
        int length = -1;
        do {
            // sourceLength doubles as the "nothing found" marker.
            index = sourceLength;

            // Find nearest prefix
            for (String prefix : prefixes) {
                int ni = source.findFrom(start, prefix.toCharArray(), true);
                // NOTE(review): "ni > 0" ignores a prefix found at index 0; presumably
                // findFrom returns a negative value when nothing is found - confirm whether
                // ">= 0" was intended before changing it.
                if (ni > 0 && ni < index) {
                    index = ni;
                }
            }

            // Try to parse it
            if (index < sourceLength) {
                length = parseLength(source, index, null);
                if (length < 0) {
                    // Not a URL here, continue searching right after this position.
                    start = index + 1;
                }
            }
        } while (length < 0 && index < sourceLength);

        return length >= 0 ? index : -1;
    }

    /**
     * Prepare URL's prefixes.
     *
     * @return list of schema prefixes and local prefixes if local URL are allowed.
     */
    private List<String> preparePrefixes() {
        Schema[] schemas = Schema.values();

        // Prepare prefixes for all schemas
        List<String> prefixes = new ArrayList<String>(
                schemas.length + (local ? LOCAL_PREFIXES.length : 0)
        );
        for (Schema schema : schemas) {
            prefixes.add(schema.getPrefix());
        }

        // For local URLs prefixes are "./", "../", "/"
        if (local) {
            Collections.addAll(prefixes, LOCAL_PREFIXES);
        }
        return prefixes;
    }

    /**
     * Parse URL. The offset must be on a URL element
     *
     * @param source     text source
     * @param offset     offset for parsing
     * @param terminator a terminator element which can be used to cut some URL parts. Can be null.
     * @return URL length or -1 if it is not a URL.
     */
    @Override
    int parseLength(Source source, int offset, ProcPatternElement terminator) {
        int length = 0;

        // A schema like http://, https://, mailto:
        Schema schema = parseSchema(source, offset);
        if (schema != null) {
            if (schemaless) {
                // A schema is present but only schemaless URLs are accepted.
                return -1;
            }
            length += schema.getLength();
        } else if (!local && !schemaless) {
            // No schema, and neither local nor schemaless URLs are accepted.
            return -1;
        }

        // An authority data like john.smith:pa55W0RD@
        if (schema != null) {
            int authorityLength = parseAuthority(source, offset + length);
            if (schema.isAuthorityMandatory() && authorityLength <= 0) {
                return -1;
            }
            length += authorityLength;
        }

        if (schema != null || schemaless) {
            // A host like example.com
            int hostLength = parseHost(source, offset + length, terminator);
            if (hostLength <= 0) {
                return -1;
            }
            length += hostLength;

            // Parse port
            length += parsePort(source, offset + length);
        }

        // For local URLs it is possible to use "./", "../", "/"
        if (schema == null && local) {
            int prefixLength = parseRegex(source, offset, calcEnd(source, terminator), REGEX_LOCAL_PREFIX);
            if (prefixLength <= 0) {
                return -1;
            }
            // The trailing "/" of the prefix is left for the path parser.
            length += prefixLength - 1;
        }

        // A path like /home/web
        int pathLength = parsePath(source, offset + length, terminator);
        if (local && schema == null && pathLength <= 0) {
            return -1;
        }
        length += pathLength;

        // A query like ?key1=value1&key2=value2
        length += parseQuery(source, offset + length, terminator);

        // A fragment like #anchor
        length += parseFragment(source, offset + length, terminator);

        return length;
    }

    /**
     * Parse a fragment like #anchor.
     *
     * @param source     text source
     * @param offset     offset for parsing
     * @param terminator a terminator element used to calculate the end of the parsed region. Can be null.
     * @return the result of matching the fragment pattern at the offset
     */
    int parseFragment(Source source, int offset, ProcPatternElement terminator) {
        return parseRegex(source, offset, calcEnd(source, terminator), REGEX_FRAGMENT);
    }

    /**
     * Parse a path like /home/web.
     *
     * @param source     text source
     * @param offset     offset for parsing
     * @param terminator a terminator element used to calculate the end of the parsed region. Can be null.
     * @return the result of matching the path pattern at the offset
     */
    int parsePath(Source source, int offset, ProcPatternElement terminator) {
        return parseRegex(source, offset, calcEnd(source, terminator), REGEX_PATH);
    }

    /**
     * Parse a port like :8080. The whole rest of the source is available for matching.
     *
     * @param source text source
     * @param offset offset for parsing
     * @return the result of matching the port pattern at the offset
     */
    private int parsePort(Source source, int offset) {
        return parseRegex(source, offset, source.length(), REGEX_PORT);
    }

    /**
     * Detect which known schema prefix, if any, starts at the offset. The comparison ignores case.
     *
     * @param source text source
     * @param offset offset for parsing
     * @return the matching schema or null if no known prefix starts at the offset
     */
    Schema parseSchema(Source source, int offset) {
        final int sourceLength = source.length();
        for (Schema schema : Schema.values()) {
            // Clamp the end so a prefix longer than the remaining text does not overrun the source.
            int end = Math.min(offset + schema.getLength(), sourceLength);
            String candidate = source.subSequence(offset, end).toString();
            if (schema.getPrefix().equalsIgnoreCase(candidate)) {
                return schema;
            }
        }
        return null;
    }

    /**
     * Supported URL schemas with their textual prefixes.
     */
    enum Schema {
        HTTP("http://"),
        HTTPS("https://"),
        FTP("ftp://"),
        MAILTO("mailto:", true);

        private final String prefix;
        private final boolean authorityMandatory;

        /**
         * Create a schema with an optional authority part.
         *
         * @param prefix schema prefix as it is written in a URL
         */
        Schema(String prefix) {
            this(prefix, false);
        }

        /**
         * @param prefix             schema prefix as it is written in a URL
         * @param authorityMandatory true if a URL with this schema is rejected without an authority part
         */
        Schema(String prefix, boolean authorityMandatory) {
            this.prefix = prefix;
            this.authorityMandatory = authorityMandatory;
        }

        public String getPrefix() {
            return prefix;
        }

        public boolean isAuthorityMandatory() {
            return authorityMandatory;
        }

        /**
         * @return the number of characters in the prefix
         */
        public int getLength() {
            return prefix.length();
        }
    }

    /**
     * @return a tag-like description of this element with its name and flags
     */
    @Override
    public String toString() {
        // The pattern must reference all four arguments; an empty pattern yields an empty string.
        return MessageFormat.format(
                "<url name=\"{0}\" ghost=\"{1}\" local=\"{2}\" schemaless=\"{3}\"/>",
                getName(), ghost, local, schemaless
        );
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy