All downloads are free. The search and download functionality uses the official Maven repository.

com.yahoo.net.UrlTokenizer Maven / Gradle / Ivy

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.net;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Splits a {@link Url} into a flat list of {@link UrlToken}s, visiting the URL components in the order scheme,
 * user info, password, host, port, path, query and fragment.
 *
 * <p>Within each component, runs of ASCII letters, digits, {@code '-'} and {@code '_'} form one lower-cased term;
 * every other character acts as a separator. Well-formed percent escapes ({@code %XX}) are decoded before this
 * classification is applied.</p>
 *
 * <p>When the URL has no scheme or no port, a guessed value is emitted instead. Guessed tokens are created with a
 * {@code null} original image, which distinguishes them from tokens taken from the URL text.</p>
 *
 * @author Simon Thoresen Hult
 */
public class UrlTokenizer {

    /** Synthetic term added immediately before the tokens of the host component. */
    public static final String TERM_STARTHOST = "StArThOsT";
    /** Synthetic term added immediately after the tokens of the host component. */
    public static final String TERM_ENDHOST = "EnDhOsT";

    /** Scheme name to default port (as a decimal string); filled by {@link #registerScheme(String, int)}. */
    private static final Map<String, String> schemeToPort = new HashMap<>();
    /** Port (as a decimal string) to scheme name; filled by {@link #registerScheme(String, int)}. */
    private static final Map<String, String> portToScheme = new HashMap<>();

    private final Url url;

    static {
        // NOTE(review): several schemes share a port (22, 119, 443). The reverse map keeps the scheme registered
        // last, so 22 -> "ssh", 119 -> "nntp" and 443 -> "shttp". The order below is therefore significant.
        registerScheme("ftp", 21);
        registerScheme("gopher", 70);
        registerScheme("http", 80);
        registerScheme("https", 443);
        registerScheme("imap", 143);
        registerScheme("mailto", 25);
        registerScheme("news", 119);
        registerScheme("nntp", 119);
        registerScheme("pop", 110);
        registerScheme("rsync", 873);
        registerScheme("rtsp", 554);
        registerScheme("sftp", 22);
        registerScheme("shttp", 443);
        registerScheme("sip", 5060);
        registerScheme("sips", 5061);
        registerScheme("snmp", 161);
        registerScheme("ssh", 22);
        registerScheme("telnet", 23);
        registerScheme("tftp", 69);
    }

    /**
     * Creates a tokenizer for the given URL string.
     *
     * @param url the string to parse with {@link Url#fromString(String)}
     */
    public UrlTokenizer(String url) {
        this(Url.fromString(url));
    }

    /**
     * Creates a tokenizer for an already parsed URL.
     *
     * @param url the URL whose components {@link #tokenize()} reads
     */
    public UrlTokenizer(Url url) {
        this.url = url;
    }

    /**
     * Returns the scheme registered for the given port, or {@code "http"} when the port is {@code null} or not
     * registered.
     */
    private static String guessScheme(String port) {
        String scheme = portToScheme.get(port);
        return scheme != null ? scheme : "http";
    }

    /**
     * Returns the default port registered for the given scheme, or {@code null} if there is none. The lookup is
     * case-sensitive, and all registered scheme names are lower case.
     */
    private static String guessPort(String scheme) {
        return schemeToPort.get(scheme);
    }

    /**
     * Tokenizes the URL this tokenizer was created with.
     *
     * @return a new list holding the tokens of all components, in component order
     */
    public List<UrlToken> tokenize() {
        List<UrlToken> lst = new LinkedList<>();

        // Position used for tokens that are not present in the URL text (guessed scheme, implied host, guessed
        // port). It is moved to the end of each leading component that is actually present.
        int offset = 0;
        String port = url.getPortString();
        String scheme = url.getScheme();
        if (scheme == null) {
            scheme = guessScheme(port);
            addTokens(lst, UrlToken.Type.SCHEME, offset, scheme, false);
        } else {
            addTokens(lst, UrlToken.Type.SCHEME, url.getSchemeBegin(), scheme, true);
            offset = url.getSchemeEnd();
        }

        String userInfo = url.getUserInfo();
        if (userInfo != null) {
            addTokens(lst, UrlToken.Type.USERINFO, url.getUserInfoBegin(), userInfo, true);
            offset = url.getUserInfoEnd();
        }

        String password = url.getPassword();
        if (password != null) {
            addTokens(lst, UrlToken.Type.PASSWORD, url.getPasswordBegin(), password, true);
            offset = url.getPasswordEnd();
        }

        String host = url.getHost();
        if (host == null || host.isEmpty()) {
            if (host != null) {
                // An empty host still has a position in the URL text.
                offset = url.getHostBegin();
            }
            if ("file".equalsIgnoreCase(scheme)) {
                // A file URL without a host gets an implied "localhost" host.
                addHostTokens(lst, offset, offset, "localhost", false);
            }
        } else {
            addHostTokens(lst, url.getHostBegin(), url.getHostEnd(), host, true);
            offset = url.getHostEnd();
        }

        if (port != null) {
            addTokens(lst, UrlToken.Type.PORT, url.getPortBegin(), port, true);
        } else {
            String guessedPort = guessPort(scheme);
            if (guessedPort != null) {
                addTokens(lst, UrlToken.Type.PORT, offset, guessedPort, false);
            }
        }

        String path = url.getPath();
        if (path != null) {
            addTokens(lst, UrlToken.Type.PATH, url.getPathBegin(), path, true);
        }

        String query = url.getQuery();
        if (query != null) {
            addTokens(lst, UrlToken.Type.QUERY, url.getQueryBegin(), query, true);
        }

        String fragment = url.getFragment();
        if (fragment != null) {
            addTokens(lst, UrlToken.Type.FRAGMENT, url.getFragmentBegin(), fragment, true);
        }

        return lst;
    }

    /**
     * Splits {@code image} into terms and appends one token per term to {@code lst}.
     *
     * <p>A term is a maximal run of ASCII digits, ASCII letters (lower-cased), {@code '-'} and {@code '_'}. A
     * percent escape with two hex digits is decoded first and then classified like any other character. A
     * {@code '%'} that is not followed by two hex digits is treated as a plain separator instead of causing an
     * exception.</p>
     *
     * @param lst    the list to append tokens to
     * @param type   the token type given to every token created
     * @param offset the position of {@code image}; when {@code orig} is true, the start index of each term within
     *               {@code image} is added to it
     * @param image  the text to split; must not be {@code null}
     * @param orig   true if {@code image} is text taken from the URL, in which case each token also carries the
     *               undecoded substring it was built from; false for synthesized text, in which case every token
     *               is created at {@code offset} with a {@code null} original image
     */
    public static void addTokens(List<UrlToken> lst, UrlToken.Type type, int offset, String image, boolean orig) {
        StringBuilder term = new StringBuilder();
        int prev = 0; // index in image where the current term starts
        for (int skip, next = 0, len = image.length(); next < len; next += skip) {
            char c = image.charAt(next);
            skip = 1;
            if (c == '%') {
                int decoded = decodePercentEscape(image, next);
                if (decoded >= 0) {
                    c = (char)decoded;
                    skip = 3;
                }
                // Otherwise the escape is malformed: c stays '%' and is handled as a separator below.
            }
            if ((c >= '0' && c <= '9') ||
                (c >= 'a' && c <= 'z') ||
                (c == '-' || c == '_'))
            {
                term.append(c);
            } else if (c >= 'A' && c <= 'Z') {
                term.append(Character.toLowerCase(c));
            } else {
                // Separator: flush the term collected since prev, if any, and restart after the separator.
                if (prev < next) {
                    lst.add(new UrlToken(type, offset + (orig ? prev : 0), orig ? image.substring(prev, next) : null,
                                         term.toString()));
                    term.setLength(0);
                }
                prev = next + skip;
            }
        }
        if (term.length() > 0) {
            lst.add(new UrlToken(type, offset + (orig ? prev : 0), orig ? image.substring(prev) : null,
                                 term.toString()));
        }
    }

    /**
     * Decodes the percent escape starting at {@code pos}, where {@code image.charAt(pos)} is {@code '%'}.
     *
     * @return the decoded value (0-255), or -1 if fewer than two characters follow or either is not a hex digit
     */
    private static int decodePercentEscape(String image, int pos) {
        if (pos + 2 >= image.length()) {
            return -1;
        }
        int high = Character.digit(image.charAt(pos + 1), 16);
        int low = Character.digit(image.charAt(pos + 2), 16);
        if (high < 0 || low < 0) {
            return -1;
        }
        return (high << 4) | low;
    }

    /**
     * Appends the tokens of a host component, bracketed by the {@link #TERM_STARTHOST} and {@link #TERM_ENDHOST}
     * marker tokens placed at {@code begin} and {@code end} respectively.
     */
    private static void addHostTokens(List<UrlToken> lst, int begin, int end, String image, boolean orig) {
        lst.add(new UrlToken(UrlToken.Type.HOST, begin, null, TERM_STARTHOST));
        addTokens(lst, UrlToken.Type.HOST, begin, image, orig);
        lst.add(new UrlToken(UrlToken.Type.HOST, end, null, TERM_ENDHOST));
    }

    /**
     * Records a scheme and its default port in both lookup maps. A later registration for the same port replaces
     * the earlier port-to-scheme entry.
     */
    private static void registerScheme(String scheme, int port) {
        String str = String.valueOf(port);
        schemeToPort.put(scheme, str);
        portToScheme.put(str, scheme);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy