// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only 1 $.
// You can buy this project and download/modify it as often as you want.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.util;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURIFactory;
/**
* Sort-friendly URI Reordering Transform.
*
* Converts URIs of the form:
*
* scheme://userinfo@domain.tld:port/path?query#fragment
*
* ...into...
*
* scheme://(tld,domain,:port@userinfo)/path?query#fragment
*
* The '(' ')' characters serve as an unambiguous notice that the so-called
* 'authority' portion of the URI ([userinfo@]host[:port] in http URIs) has
* been transformed; the commas prevent confusion with regular hostnames.
*
* This remedies the 'problem' with standard URIs that the host portion of a
* regular URI, with its dotted-domains, is actually in reverse order from
* the natural hierarchy that's usually helpful for grouping and sorting.
*
* The value of respecting URI case variance is considered negligible: it
* is vanishingly rare for case-variance to be meaningful, while URI case-
* variance often arises from people's confusion or sloppiness, and they
* only correct it insofar as necessary to avoid blatant problems. Thus
* the usual SURT form is considered to be flattened to all lowercase, and
* not completely reversible.
*
* @author gojomo
*/
public class SURT {
    // NOTE(review): these are effectively constants but are declared
    // non-final; left as-is so that any subclass relying on that keeps
    // compiling. Consider making them final.
    protected static char DOT = '.';
    protected static String BEGIN_TRANSFORMED_AUTHORITY = "(";
    protected static String TRANSFORMED_HOST_DELIM = ",";
    protected static String END_TRANSFORMED_AUTHORITY = ")";

    // Capturing groups of URI_SPLITTER:
    // 1: scheme://
    // 2: userinfo (if present)
    // 3: @ (if present)
    // 4: dotted-quad host
    // 5: other host
    // 6: :port
    // 7: path
    protected static String URI_SPLITTER =
            "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?"+
    //        1           2                                 3
            "(?:((?:\\d{1,3}\\.){3}\\d{1,3})|(\\S+?))(:\\d+)?(/\\S*)?$";
    //           4                            5       6       7

    // RFC2396
    //       reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                     "$" | ","
    //       unreserved  = alphanum | mark
    //       mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    //       userinfo    = *( unreserved | escaped |
    //                        ";" | ":" | "&" | "=" | "+" | "$" | "," )
    //       escaped     = "%" hex hex

    /**
     * Utility method for creating the SURT form of the URI in the
     * given String.
     *
     * By default, does not preserve casing.
     *
     * @param s String URI to be converted to SURT form
     * @return SURT form
     */
    public static String fromURI(String s) {
        return fromURI(s, false);
    }

    /**
     * Utility method for creating the SURT form of the URI in the
     * given String.
     *
     * If it appears a bit convoluted in its approach, note that it was
     * optimized to minimize object-creation after allocation-sites profiling
     * indicated this method was a top source of garbage in long-running crawls.
     *
     * Assumes that the String URI has already been cleaned/fixed (eg
     * by UURI fixup) in ways that put it in its crawlable form for
     * evaluation.
     *
     * A string that does not match {@link #URI_SPLITTER} is returned
     * unchanged. Unless <code>preserveCase</code> is set, the entire
     * result (authority, userinfo and path alike) is lowercased.
     *
     * @param s String URI to be converted to SURT form
     * @param preserveCase whether original case should be preserved
     * @return SURT form, or <code>s</code> itself if it is not an
     * authority-based URI
     */
    public static String fromURI(String s, boolean preserveCase) {
        Matcher m = TextUtils.getMatcher(URI_SPLITTER, s);
        try {
            if (!m.matches()) {
                // not an authority-based URI scheme; return unchanged
                return s;
            }
            // preallocate enough space for SURT form, which includes
            // 3 extra characters ('(', ')', and one more ',' than '.'s
            // in original)
            StringBuilder builder = new StringBuilder(s.length() + 3);
            append(builder, s, m.start(1), m.end(1)); // scheme://
            builder.append(BEGIN_TRANSFORMED_AUTHORITY); // '('

            if (m.start(4) > -1) {
                // dotted-quad ip match: don't reverse
                append(builder, s, m.start(4), m.end(4));
            } else {
                // other hostname match: emit the dot-separated segments
                // right-to-left, each followed by ','
                int hostStart = m.start(5);
                int hostSegEnd = m.end(5);
                for (int i = hostSegEnd - 1; i >= hostStart; i--) {
                    // a segment begins at the first host character or
                    // immediately after a '.'; keep scanning otherwise
                    if (i > hostStart && s.charAt(i - 1) != DOT) {
                        continue;
                    }
                    append(builder, s, i, hostSegEnd); // rev host segment
                    builder.append(TRANSFORMED_HOST_DELIM); // ','
                    hostSegEnd = i - 1; // step over the '.' itself
                }
            }

            // groups that did not participate report start == -1 and are
            // skipped by append()
            append(builder, s, m.start(6), m.end(6)); // :port
            append(builder, s, m.start(3), m.end(3)); // at
            append(builder, s, m.start(2), m.end(2)); // userinfo
            builder.append(END_TRANSFORMED_AUTHORITY); // ')'
            append(builder, s, m.start(7), m.end(7)); // path

            if (!preserveCase) {
                // lowercase in place rather than allocating another String
                for (int i = 0; i < builder.length(); i++) {
                    builder.setCharAt(i,
                            Character.toLowerCase(builder.charAt(i)));
                }
            }
            return builder.toString();
        } finally {
            // hand the matcher back on every exit path, including
            // unexpected exceptions
            TextUtils.recycleMatcher(m);
        }
    }

    /**
     * Append <code>cs[start, end)</code> to the given builder, doing
     * nothing when <code>start</code> is negative (as reported by
     * {@link Matcher#start(int)} for a group that did not match).
     *
     * @param b builder to append to
     * @param cs source characters
     * @param start inclusive start index, or a negative value to skip
     * @param end exclusive end index
     */
    private static void append(StringBuilder b, CharSequence cs, int start,
            int end) {
        if (start < 0) {
            return;
        }
        b.append(cs, start, end);
    }

    /**
     * Given a plain URI or hostname/hostname+path, deduce an implied SURT
     * prefix from it. Results may be unpredictable on strings that cannot
     * be interpreted as URIs.
     *
     * UURI 'fixup' is applied to the URI that is built.
     *
     * @param u URI or almost-URI to consider
     * @return implied SURT prefix form
     */
    public static String prefixFromPlain(String u) {
        u = fromPlain(u);
        // truncate to implied prefix
        u = SurtPrefixSet.asPrefix(u);
        return u;
    }

    /**
     * Given a plain URI or hostname/hostname+path, give its SURT form.
     * Results may be unpredictable on strings that cannot
     * be interpreted as URIs.
     *
     * UURI 'fixup' is applied to the URI before conversion to SURT
     * form. If that fixup fails with a URIException, the stack trace is
     * printed and conversion continues with the un-fixed string.
     *
     * @param u URI or almost-URI to consider
     * @return SURT form
     */
    public static String fromPlain(String u) {
        u = ArchiveUtils.addImpliedHttpIfNecessary(u);
        boolean trailingSlash = u.endsWith("/");
        // ensure all typical UURI cleanup (incl. IDN-punycoding) is done
        try {
            u = UsableURIFactory.getInstance(u).toString();
        } catch (URIException e) {
            e.printStackTrace();
            // allow to continue with original string uri
        }
        // except: don't let UURI-fixup add a trailing slash
        // if it wasn't already there (presence or absence of
        // such slash has special meaning specifying implied
        // SURT prefixes)
        if (!trailingSlash && u.endsWith("/")) {
            u = u.substring(0, u.length() - 1);
        }
        // convert to full SURT
        u = SURT.fromURI(u);
        return u;
    }

    /**
     * Allow class to be used as a command-line tool for converting
     * URL lists (or naked host or host/path fragments implied
     * to be HTTP URLs) to SURT form. Lines that cannot be converted
     * are returned unchanged.
     *
     * Everything from the first '#' on a line to its end is discarded,
     * including when the '#' is the first character; lines that are
     * empty after that and after trimming are skipped.
     *
     * Read from stdin or first file argument. Writes to stdout or
     * second argument filename. Both streams are closed before
     * returning, also when an IOException is thrown.
     *
     * @param args cmd-line arguments
     * @throws IOException if reading the input or opening a file fails
     */
    public static void main(String[] args) throws IOException {
        InputStream in = args.length > 0
                ? new BufferedInputStream(new FileInputStream(args[0]))
                : System.in;
        try {
            PrintStream out = args.length > 1
                    ? new PrintStream(new BufferedOutputStream(
                            new FileOutputStream(args[1])))
                    : System.out;
            try {
                BufferedReader br =
                        new BufferedReader(new InputStreamReader(in));
                String line;
                while ((line = br.readLine()) != null) {
                    // strip trailing comment; >= 0 so that a '#' in the
                    // first column is recognized as well
                    int hash = line.indexOf('#');
                    if (hash >= 0) {
                        line = line.substring(0, hash);
                    }
                    line = line.trim();
                    if (line.length() == 0) {
                        continue;
                    }
                    line = ArchiveUtils.addImpliedHttpIfNecessary(line);
                    out.println(SURT.fromURI(line));
                }
            } finally {
                // flushes buffered output even if reading failed
                out.close();
            }
        } finally {
            in.close();
        }
    }
}