// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.util;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.Locale;
import org.archive.url.UsableURI;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;
/**
 * Specialized {@link PrefixSet} for keeping a set of SURT-form String
 * prefixes.
 *
 * Redundant prefixes (those that are themselves prefixed
 * by other set entries) are eliminated.
 *
 * @author gojomo
 */
public class SurtPrefixSet extends PrefixSet {
    private static final long serialVersionUID = 2598365040524933110L;

    /** Leading marker identifying a line as an explicit add-prefix directive. */
    private static final String SURT_PREFIX_DIRECTIVE = "+";

    /**
     * Build the line iterator shared by all of the import methods: the
     * reader is buffered, read line by line, and filtered through a
     * {@link RegexLineIterator} configured with its COMMENT_LINE,
     * NONWHITESPACE_ENTRY_TRAILING_COMMENT and ENTRY patterns.
     * (Presumably this skips comment lines and strips trailing comments;
     * see RegexLineIterator for the exact rules.)
     *
     * @param r source of lines
     * @return iterator whose elements are cast to String by the callers
     */
    private static Iterator entryIterator(Reader r) {
        return new RegexLineIterator(
                new LineReadingIterator(new BufferedReader(r)),
                RegexLineIterator.COMMENT_LINE,
                RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                RegexLineIterator.ENTRY);
    }

    /**
     * Read a set of SURT prefixes from a reader source; each entry is
     * lower-cased and added to this set.
     *
     * @param r reader over file of SURT_format strings
     */
    public void importFrom(Reader r) {
        Iterator iter = entryIterator(r);
        while (iter.hasNext()) {
            String s = (String) iter.next();
            // Locale.ROOT: lower-casing must not depend on the JVM's
            // default locale (e.g. Turkish dotless-i would corrupt hosts).
            add(s.toLowerCase(Locale.ROOT));
        }
    }

    /**
     * Read plain URIs (or fragmentary hostnames) from a reader and add the
     * SURT prefix implied by each one.
     *
     * @param r Where to read from.
     */
    public void importFromUris(Reader r) {
        Iterator iter = entryIterator(r);
        while (iter.hasNext()) {
            // each entry is a URI (or even fragmentary hostname), not a SURT
            addFromPlain((String) iter.next());
        }
    }

    /**
     * Import SURT prefixes from a reader with mixed URI and SURT prefix
     * format. Entries starting with the "+" directive marker are always
     * applied (marker removed); all other entries are only used when
     * <code>deduceFromSeeds</code> is true.
     *
     * @param r the reader to import the prefixes from
     * @param deduceFromSeeds true to also import SURT prefixes implied
     *                        from normal URIs/hostname seeds
     */
    public void importFromMixed(Reader r, boolean deduceFromSeeds) {
        Iterator iter = entryIterator(r);
        while (iter.hasNext()) {
            String s = (String) iter.next();
            if (s.startsWith(SURT_PREFIX_DIRECTIVE)) {
                considerAsAddDirective(
                        s.substring(SURT_PREFIX_DIRECTIVE.length()));
            } else if (deduceFromSeeds) {
                // also deducing 'implied' SURT prefixes
                // from normal URIs/hostname seeds
                addFromPlain(s);
            }
        }
    }

    /**
     * Interpret the given SURT/URI/host as a directive, returning true
     * if it was meaningful.
     *
     * @param suri potential directive
     * @return boolean true if applied as directive, false otherwise
     *         (only the case for an empty/whitespace-only string)
     */
    public boolean considerAsAddDirective(String suri) {
        String u = suri.trim();
        if (u.length() == 0) {
            // empty string: consider a mistake
            return false;
        }
        if (u.indexOf("(") > 0) {
            // formal SURT prefix; lower-case just in case
            // (locale-independent so ASCII hosts are never mangled)
            add(u.toLowerCase(Locale.ROOT));
        } else {
            // hostname/normal form URI from which
            // to deduce SURT prefix
            addFromPlain(u);
        }
        return true;
    }

    /**
     * Given a plain URI or hostname, deduce an implied SURT prefix from
     * it and add to active prefixes.
     *
     * @param u String of URI or hostname
     */
    public void addFromPlain(String u) {
        add(prefixFromPlainForceHttp(u));
    }

    /**
     * Given a plain URI or hostname/hostname+path, deduce an implied SURT
     * prefix from it. Results may be unpredictable on strings that cannot
     * be interpreted as URIs.
     *
     * UURI 'fixup' is applied to the URI that is built.
     *
     * HTTPS URIs are changed to HTTP as a convenience for the usual
     * preferred common-treatment.
     *
     * @param u URI or almost-URI to consider
     * @return implied SURT prefix form
     */
    public static String prefixFromPlainForceHttp(String u) {
        return coerceFromHttpsForComparison(SURT.prefixFromPlain(u));
    }

    /**
     * For SURT comparisons -- prefixes or candidates being checked against
     * those prefixes -- we treat https URIs as if they were http.
     *
     * @param u string to coerce if it has https scheme
     * @return string converted to http scheme, or original if not necessary
     */
    private static String coerceFromHttpsForComparison(String u) {
        if (u.startsWith("https://")) {
            // drop only the 's': keep everything from "://" onward
            u = "http" + u.substring("https".length());
        }
        return u;
    }

    /**
     * Utility method for truncating a SURT that came from a
     * full URI (as a seed, for example) into a prefix
     * for determining inclusion.
     *
     * This involves:
     *
     * (1) removing the last path component, if any
     *     (anything after the last '/', if there are
     *     at least 3 '/'s)
     * (2) removing a trailing ')', if present, opening
     *     the possibility of proper subdomains. (This
     *     means that the presence or absence of a
     *     trailing '/' after a hostname in a seed list
     *     is significant for how the SURT prefix is
     *     created, even though it is not significant for
     *     the URI's treatment as a seed.)
     *
     * @param s String to work on.
     * @return As prefix.
     */
    public static String asPrefix(String s) {
        // Strip last path-segment, if more than 3 slashes
        s = s.replaceAll("^(.*//.*/)[^/]*", "$1");
        // Strip trailing ")", if present and NO path (no 3rd slash).
        if (!s.endsWith("/")) {
            s = s.replaceAll("^(.*)\\)", "$1");
        }
        return s;
    }

    /**
     * Calculate the SURT form URI to use as a candidate against prefixes
     * from the given URI, with an https scheme coerced to http.
     *
     * @param u URI to get the candidate SURT for; may be null
     * @return SURT form of URI for evaluation, or null if unavailable
     */
    public static String getCandidateSurt(UsableURI u) {
        if (u == null) {
            return null;
        }
        String candidateSurt = u.getSurtForm();
        if (candidateSurt == null) {
            // honor the documented "null if unavailable" contract rather
            // than failing inside the scheme coercion
            return null;
        }
        // also want to treat https as http
        return coerceFromHttpsForComparison(candidateSurt);
    }

    /**
     * Write every prefix in this set to the given writer, one per line
     * (each followed by "\n"), in the set's iteration order.
     *
     * @param fw writer to export to; not closed by this method
     * @throws IOException if writing fails
     */
    public void exportTo(FileWriter fw) throws IOException {
        Iterator iter = this.iterator();
        while (iter.hasNext()) {
            fw.write((String) iter.next() + "\n");
        }
    }

    /**
     * Changes all prefixes so that they enforce an exact host. For
     * prefixes that already include a ')', this means discarding
     * anything after ')' (path info). For prefixes that don't include
     * a ')' -- domain prefixes open to subdomains -- add the closing
     * ')' (or ",)").
     */
    public void convertAllPrefixesToHosts() {
        // iterate over a copy so this set can be modified while looping
        SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
        Iterator iter = iterCopy.iterator();
        while (iter.hasNext()) {
            String prefix = (String) iter.next();
            String convPrefix = convertPrefixToHost(prefix);
            // compare by value, not by reference
            if (!prefix.equals(convPrefix)) {
                // conversion changed the prefix: update set
                this.remove(prefix);
                this.add(convPrefix);
            }
        }
    }

    /**
     * Convert a single prefix so that it enforces an exact host.
     *
     * @param prefix SURT prefix to convert
     * @return the prefix unchanged if it already ends with ')'; the prefix
     *         with ")" (or ",)") appended if it contains no ')'; otherwise
     *         the prefix truncated just after its first ')'
     */
    public static String convertPrefixToHost(String prefix) {
        if (prefix.endsWith(")")) {
            return prefix; // no change necessary
        }
        int closeParen = prefix.indexOf(')');
        if (closeParen < 0) {
            // open-ended domain prefix
            if (!prefix.endsWith(",")) {
                prefix += ",";
            }
            prefix += ")";
        } else {
            // prefix with excess path-info
            prefix = prefix.substring(0, closeParen + 1);
        }
        return prefix;
    }

    /**
     * Changes all prefixes so that they only enforce a general
     * domain (allowing subdomains). For prefixes that don't include
     * a ')', no change is necessary. For others, truncate everything
     * from the ')' onward. Additionally, truncate off "www," if it
     * appears.
     */
    public void convertAllPrefixesToDomains() {
        // iterate over a copy so this set can be modified while looping
        SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
        Iterator iter = iterCopy.iterator();
        while (iter.hasNext()) {
            String prefix = (String) iter.next();
            String convPrefix = convertPrefixToDomain(prefix);
            // compare by value, not by reference
            if (!prefix.equals(convPrefix)) {
                // conversion changed the prefix: update set
                this.remove(prefix);
                this.add(convPrefix);
            }
        }
    }

    /**
     * Convert a single prefix so that it only enforces a general domain.
     *
     * @param prefix SURT prefix to convert
     * @return the prefix truncated before its first ')' (if any), with a
     *         trailing "www," removed when present
     */
    public static String convertPrefixToDomain(String prefix) {
        int closeParen = prefix.indexOf(')');
        if (closeParen >= 0) {
            prefix = prefix.substring(0, closeParen);
        }
        // strip 'www,' when present
        if (prefix.endsWith("www,")) {
            prefix = prefix.substring(0, prefix.length() - 4);
        }
        return prefix;
    }

    /**
     * Allow class to be used as a command-line tool for converting
     * URL lists (or naked host or host/path fragments implied
     * to be HTTP URLs) to implied SURT prefix form.
     *
     * Read from stdin or first file argument. Writes to stdout or
     * second file argument. Everything from a '#' to the end of a
     * line is treated as a comment; blank lines are skipped.
     *
     * @param args cmd-line arguments: may include input file and
     *             output file
     * @throws IOException if the input cannot be opened or read, or the
     *                     output cannot be opened
     */
    public static void main(String[] args) throws IOException {
        InputStream in = null;
        PrintStream out = null;
        try {
            in = args.length > 0
                    ? new BufferedInputStream(new FileInputStream(args[0]))
                    : System.in;
            out = args.length > 1
                    ? new PrintStream(new BufferedOutputStream(
                            new FileOutputStream(args[1])))
                    : System.out;
            BufferedReader br =
                    new BufferedReader(new InputStreamReader(in));
            String line;
            while ((line = br.readLine()) != null) {
                // >= 0 so that a line *starting* with '#' is a comment too
                int commentStart = line.indexOf('#');
                if (commentStart >= 0) {
                    line = line.substring(0, commentStart);
                }
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                out.println(prefixFromPlainForceHttp(line));
            }
        } finally {
            // release (and flush) the streams even when reading or
            // conversion fails part-way through
            if (out != null) {
                out.close();
            }
            if (in != null) {
                in.close();
            }
        }
    }
}