All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.util.SurtPrefixSet Maven / Gradle / Ivy

There is a newer version: 1.1.9
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.util;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;

import org.archive.url.UsableURI;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;

/**
 * Specialized TreeSet for keeping a set of String prefixes. 
 * 
 * Redundant prefixes (those that are themselves prefixed
 * by other set entries) are eliminated.
 * 
 * @author gojomo
 */
public class SurtPrefixSet extends PrefixSet {

    private static final long serialVersionUID = 2598365040524933110L;

    private static final String SURT_PREFIX_DIRECTIVE = "+";

    
    /**
     * Read a set of SURT prefixes from a reader source; keep sorted and 
     * with redundant entries removed.
     * 
     * @param r reader over file of SURT_format strings
     * @throws IOException
     */
    public void importFrom(Reader r) {
        BufferedReader reader = new BufferedReader(r);
        String s;
        
        Iterator iter = 
            new RegexLineIterator(
                    new LineReadingIterator(reader),
                    RegexLineIterator.COMMENT_LINE,
                    RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                    RegexLineIterator.ENTRY);

        while (iter.hasNext()) {
            s = (String) iter.next();
            add(s.toLowerCase());
        }
    }

    /**
     * @param r Where to read from.
     */
    public void importFromUris(Reader r) {
        BufferedReader reader = new BufferedReader(r);
        String s;
        
        Iterator iter = 
            new RegexLineIterator(
                    new LineReadingIterator(reader),
                    RegexLineIterator.COMMENT_LINE,
                    RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                    RegexLineIterator.ENTRY);

        while (iter.hasNext()) {
            s = (String) iter.next();
            // s is a URI (or even fragmentary hostname), not a SURT
            addFromPlain(s);
        }
    }

    /**
     * Import SURT prefixes from a reader with mixed URI and SURT prefix
     * format. 
     * 
     * @param r  the reader to import the prefixes from
     * @param deduceFromSeeds   true to also import SURT prefixes implied
     *                          from normal URIs/hostname seeds
     */
    public void importFromMixed(Reader r, boolean deduceFromSeeds) {
        BufferedReader reader = new BufferedReader(r);
        String s;
        
        Iterator iter = 
            new RegexLineIterator(
                    new LineReadingIterator(reader),
                    RegexLineIterator.COMMENT_LINE,
                    RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                    RegexLineIterator.ENTRY);

        while (iter.hasNext()) {
            s = (String) iter.next();
            if(s.startsWith(SURT_PREFIX_DIRECTIVE)) {
                considerAsAddDirective(s.substring(SURT_PREFIX_DIRECTIVE.length()));
                continue; 
            } else {
                if(deduceFromSeeds) {
                    // also deducing 'implied' SURT prefixes 
                    // from normal URIs/hostname seeds
                    addFromPlain(s);
                }
            }
        }
    }
    

    /**
     * Interpret the given SURT/URI/host as a directive, returning true 
     * if it was meaningful. 
     * 
     * @param suri potential directive
     * @return boolean true if applied as directive, false otherwise
     */
    public boolean considerAsAddDirective(String suri) {
        String u = suri.trim();
        if(u.length()==0) {
            // empty string: consider a mistake
            return false; 
        }
        if(u.indexOf("(")>0) {
            // formal SURT prefix; toLowerCase just in case
            add(u.toLowerCase());
        } else {
            // hostname/normal form URI from which 
            // to deduce SURT prefix
            addFromPlain(u);
        }
        return true;
       
    }
    
    /**
     * Given a plain URI or hostname, deduce an implied SURT prefix from
     * it and add to active prefixes. 
     * 
     * @param u String of URI or hostname
     */
    public void addFromPlain(String u) {
        u = prefixFromPlainForceHttp(u);
        add(u);
    }

    /**
     * Given a plain URI or hostname/hostname+path, deduce an implied SURT 
     * prefix from it. Results may be unpredictable on strings that cannot
     * be interpreted as URIs. 
     * 
     * UURI 'fixup' is applied to the URI that is built. 
     * 
     * HTTPS URIs are changed to HTTP as a convenience for the usual
     * preferred common-treatment. 
     *
     * @param u URI or almost-URI to consider
     * @return implied SURT prefix form
     */
    public static String prefixFromPlainForceHttp(String u) {
        u = SURT.prefixFromPlain(u);
        u = coerceFromHttpsForComparison(u);
        return u;
    }

    /**
     * For SURT comparisons -- prefixes or candidates being checked against
     * those prefixes -- we treat https URIs as if they were http.
     * 
     * @param u string to coerce if it has https scheme
     * @return string converted to http scheme, or original if not necessary
     */
    private static String coerceFromHttpsForComparison(String u) {
        if (u.startsWith("https://")) {
            u = "http" + u.substring("https".length());
        }
        return u;
    }

    /**
     * Utility method for truncating a SURT that came from a 
     * full URI (as a seed, for example) into a prefix
     * for determining inclusion.
     * 
     * This involves: 
     * 
     *    (1) removing the last path component, if any
     *        (anything after the last '/', if there are
     *        at least 3 '/'s)
     *    (2) removing a trailing ')', if present, opening
     *        the possibility of proper subdomains. (This
     *        means that the presence or absence of a
     *        trailing '/' after a hostname in a seed list
     *        is significant for the how the SURT prefix is 
     *        created, even though it is not signficant for 
     *        the URI's treatment as a seed.)
     * 
* * @param s String to work on. * @return As prefix. */ public static String asPrefix(String s) { // Strip last path-segment, if more than 3 slashes s = s.replaceAll("^(.*//.*/)[^/]*","$1"); // Strip trailing ")", if present and NO path (no 3rd slash). if (!s.endsWith("/")) { s = s.replaceAll("^(.*)\\)","$1"); } return s; } /** * Calculate the SURT form URI to use as a candidate against prefixes * from the given Object (CandidateURI or UURI) * * @param object CandidateURI or UURI * @return SURT form of URI for evaluation, or null if unavailable */ public static String getCandidateSurt(UsableURI u) { if (u == null) { return null; } String candidateSurt = u.getSurtForm(); // also want to treat https as http candidateSurt = coerceFromHttpsForComparison(candidateSurt); return candidateSurt; } /** * @param fw * @throws IOException */ public void exportTo(FileWriter fw) throws IOException { Iterator iter = this.iterator(); while(iter.hasNext()) { fw.write((String)iter.next() + "\n"); } } /** * Changes all prefixes so that they enforce an exact host. For * prefixes that already include a ')', this means discarding * anything after ')' (path info). For prefixes that don't include * a ')' -- domain prefixes open to subdomains -- add the closing * ')' (or ",)"). */ public void convertAllPrefixesToHosts() { SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone(); Iterator iter = iterCopy.iterator(); while (iter.hasNext()) { String prefix = (String) iter.next(); String convPrefix = convertPrefixToHost(prefix); if(prefix!=convPrefix) { // if returned value not unchanged, update set this.remove(prefix); this.add(convPrefix); } } } public static String convertPrefixToHost(String prefix) { if(prefix.endsWith(")")) { return prefix; // no change necessary } if(prefix.indexOf(')')<0) { // open-ended domain prefix if(!prefix.endsWith(",")) { prefix += ","; } prefix += ")"; } else { // prefix with excess path-info prefix = prefix.substring(0,prefix.indexOf(')')+1); } return prefix; } /** * Changes all prefixes so that they only enforce a general * domain (allowing subdomains).For prefixes that don't include * a ')', no change is necessary. For others, truncate everything * from the ')' onward. Additionally, truncate off "www," if it * appears. */ public void convertAllPrefixesToDomains() { SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone(); Iterator iter = iterCopy.iterator(); while (iter.hasNext()) { String prefix = (String) iter.next(); String convPrefix = convertPrefixToDomain(prefix); if(prefix!=convPrefix) { // if returned value not unchanged, update set this.remove(prefix); this.add(convPrefix); } } } public static String convertPrefixToDomain(String prefix) { if(prefix.indexOf(')')>=0) { prefix = prefix.substring(0,prefix.indexOf(')')); } // strip 'www,' when present if(prefix.endsWith("www,")) { prefix = prefix.substring(0,prefix.length()-4); } return prefix; } /** * Allow class to be used as a command-line tool for converting * URL lists (or naked host or host/path fragments implied * to be HTTP URLs) to implied SURT prefix form. * * Read from stdin or first file argument. Writes to stdout. * * @param args cmd-line arguments: may include input file * @throws IOException */ public static void main(String[] args) throws IOException { InputStream in = args.length > 0 ? new BufferedInputStream( new FileInputStream(args[0])) : System.in; PrintStream out = args.length > 1 ? new PrintStream( new BufferedOutputStream(new FileOutputStream(args[1]))) : System.out; BufferedReader br = new BufferedReader(new InputStreamReader(in)); String line; while((line = br.readLine())!=null) { if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#")); line = line.trim(); if(line.length()==0) continue; out.println(prefixFromPlainForceHttp(line)); } br.close(); out.close(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy