All downloads are free. Search and download functionality uses the official Maven repository.

com.ecfront.kwe.KeyWordExtract Maven / Gradle / Ivy

There is a newer version: 1.6
Show newest version
package com.ecfront.kwe;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.nio.file.Files;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Extracts a keyword from a URL using per-host rules.
 *
 * <p>Rules are loaded once from the classpath resource {@code kwe-rules.txt} when the class is
 * initialised, and more can be added at runtime with {@link #loadOnlineRules(String)}. Each rule
 * is one line of {@code |}-separated fields, in one of two shapes:
 * <ul>
 *   <li>{@code host|query|<key>|<codec>|<encoding>} - take the value of query parameter
 *       {@code <key>};</li>
 *   <li>{@code host|<anything else>|<index>|<codec>|<encoding>} - take the path segment at the
 *       zero-based {@code <index>} (segment 0 is the text after the first {@code /});</li>
 *   <li>{@code host|<javascript>} - run the JavaScript statements, which receive the variable
 *       {@code uri} (path followed by query) and must assign the keyword to {@code result}.</li>
 * </ul>
 * The only codec currently implemented is {@code decodeURI} (case-insensitive), which URL-decodes
 * the value with the rule's encoding. Lines with any other number of fields are ignored.
 *
 * <p>The rule table is a plain {@link HashMap}; this class does no synchronisation of its own.
 */
public class KeyWordExtract {

    /** Host name to the parsers registered for it, kept in the order the rules were loaded. */
    private static final Map<String, Set<Parser>> RULES = new HashMap<>();
    private static final String LOCAL_RULE_FILE = "kwe-rules.txt";
    private static final ScriptEngineManager SCRIPT_ENGINE_MANAGER = new ScriptEngineManager();
    /** May be {@code null} on runtimes that do not ship the Nashorn engine. */
    private static final ScriptEngine jsEngine = SCRIPT_ENGINE_MANAGER.getEngineByName("nashorn");

    static {
        try {
            loadRules(Helper.readAllByClassPath(LOCAL_RULE_FILE));
        } catch (IOException e) {
            // Best effort: the class stays usable without bundled rules (see loadOnlineRules).
            e.printStackTrace();
        }
    }

    /**
     * Extracts the keyword from {@code url} using the rules registered for its host.
     *
     * @param url an absolute URL
     * @return the value produced by the first rule that matches, or {@code ""} when the host has
     *         no rules or none of them matches
     * @throws RuntimeException wrapping any failure (malformed URL, undecodable value, unknown
     *         codec, script error)
     */
    public static String extract(String url) {
        try {
            URL targetUrl = new URL(url);
            Set<Parser> parsers = RULES.getOrDefault(targetUrl.getHost(), Collections.emptySet());
            for (Parser parser : parsers) {
                Optional<String> matched = parser.parse(targetUrl.getPath(), targetUrl.getQuery());
                if (matched.isPresent()) {
                    return matched.get();
                }
            }
            return "";
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Downloads a rule file and adds its rules to the ones already loaded.
     *
     * @param ruleUrl location of the rule file, one rule per line
     * @throws IOException if the file cannot be fetched or read
     */
    public static void loadOnlineRules(String ruleUrl) throws IOException {
        loadRules(Helper.httpGet(ruleUrl));
    }

    /** Parses rule lines and registers a parser for each well-formed one. */
    private static void loadRules(List<String> rules) {
        for (String rule : rules) {
            String[] items = rule.split("\\|");
            // NOTE(review): a JavaScript rule whose code contains '|' is split into extra fields
            // and therefore skipped or misread here - confirm the rule format forbids that.
            if (items.length == 5 || items.length == 2) {
                RULES.computeIfAbsent(items[0], host -> new LinkedHashSet<>()).add(new Parser(items));
            }
        }
    }

    /** One extraction rule: either a declarative query/path rule or a JavaScript function. */
    private static class Parser {

        private final boolean wdInQuery;
        private final int pathIndex;
        private final String queryKey;
        private final String codec;
        private final String enc;
        /** Name of the function defined in the script engine; {@code null} for declarative rules. */
        private final String jsFun;

        /**
         * @param items the rule's fields; length 5 for a declarative rule, otherwise treated as
         *              {@code [host, javascript]}
         * @throws NumberFormatException if a path rule's index is not an integer
         * @throws IllegalStateException if a JavaScript rule is given but no engine is available
         * @throws RuntimeException      wrapping a {@link ScriptException} from invalid JavaScript
         */
        private Parser(String[] items) {
            if (items.length == 5) {
                wdInQuery = items[1].equalsIgnoreCase("query");
                queryKey = wdInQuery ? items[2] : null;
                pathIndex = wdInQuery ? 0 : Integer.parseInt(items[2]);
                codec = items[3];
                enc = items[4];
                jsFun = null;
            } else {
                if (jsEngine == null) {
                    throw new IllegalStateException(
                            "JavaScript rule for host [" + items[0] + "] requires the 'nashorn' script engine, which is not available.");
                }
                wdInQuery = false;
                queryKey = null;
                pathIndex = 0;
                codec = null;
                enc = null;
                // Build a valid JS identifier: fixed prefix (hosts may start with a digit), every
                // non-identifier character mapped to '_', and a hex hash (never contains '-').
                jsFun = "kwe_" + items[0].replaceAll("[^A-Za-z0-9_]", "_")
                        + "_" + Integer.toHexString(items[1].hashCode());
                String js = "function " + jsFun + "(uri){\r\n" +
                        "var result = '';\r\n" +
                        items[1] + ";\r\n" +
                        "return result;\r\n" +
                        "}\r\n";
                try {
                    jsEngine.eval(js);
                } catch (ScriptException e) {
                    throw new RuntimeException(e);
                }
            }
        }

        /**
         * Applies this rule to the parts of a URL.
         *
         * @param path  the URL path
         * @param query the URL query, or {@code null} when the URL has none
         * @return the extracted value, or empty when the rule does not match
         */
        private Optional<String> parse(String path, String query) throws UnsupportedEncodingException {
            if (jsFun != null) {
                return invokeScript(path, query);
            }
            if (wdInQuery) {
                if (query == null) {
                    return Optional.empty();
                }
                String prefix = queryKey + '=';
                for (String queryItem : query.split("&")) {
                    if (queryItem.startsWith(prefix)) {
                        return Optional.of(parse(queryItem.substring(prefix.length())));
                    }
                }
                return Optional.empty();
            }
            // A path starts with '/', so element 0 of the split is empty and segment i is at i + 1.
            String[] pathItems = path.split("/");
            if (pathIndex >= 0 && pathItems.length - 1 > pathIndex) {
                return Optional.of(parse(pathItems[pathIndex + 1]));
            }
            return Optional.empty();
        }

        /** Calls the rule's JavaScript function with the path followed by the query (if any). */
        private Optional<String> invokeScript(String path, String query) {
            String uri = query == null ? path : path + query;
            try {
                Object result = ((Invocable) jsEngine).invokeFunction(jsFun, uri);
                return result == null ? Optional.empty() : Optional.of(result.toString());
            } catch (ScriptException | NoSuchMethodException e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * Decodes a raw value with this rule's codec.
         *
         * @throws RuntimeException if the codec is not one this class implements
         */
        private String parse(String encodeValue) throws UnsupportedEncodingException {
            // Locale.ROOT keeps the comparison stable under locales with special casing rules.
            switch (codec.toLowerCase(Locale.ROOT)) {
                case "decodeuri":
                    return URLDecoder.decode(encodeValue, enc);
                default:
                    throw new RuntimeException("Decoder[" + codec + "] NOT Exist.");
            }
        }

        /** Value equality, so loading the same rule twice does not register it twice. */
        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof Parser)) {
                return false;
            }
            Parser that = (Parser) other;
            return wdInQuery == that.wdInQuery
                    && pathIndex == that.pathIndex
                    && Objects.equals(queryKey, that.queryKey)
                    && Objects.equals(codec, that.codec)
                    && Objects.equals(enc, that.enc)
                    && Objects.equals(jsFun, that.jsFun);
        }

        @Override
        public int hashCode() {
            return Objects.hash(wdInQuery, pathIndex, queryKey, codec, enc, jsFun);
        }

    }

    /** I/O helpers for reading rule files. */
    private static class Helper {

        private static final int CONNECT_TIMEOUT_MILLIS = 10_000;
        private static final int READ_TIMEOUT_MILLIS = 30_000;

        private Helper() {
        }

        /**
         * Reads all lines of a classpath resource, preferring a plain file under the classpath
         * root of this class and falling back to the class loader.
         *
         * @param classpath resource name relative to the classpath root
         * @throws FileNotFoundException if the resource cannot be located
         * @throws IOException           if reading fails
         */
        private static List<String> readAllByClassPath(String classpath) throws IOException {
            // getResource("/") yields no usable directory when the class is not loaded from one.
            URL root = KeyWordExtract.class.getResource("/");
            if (root != null) {
                File file = new File(root.getPath() + classpath);
                if (file.exists()) {
                    return Files.readAllLines(file.toPath());
                }
            }
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            if (loader == null) {
                loader = KeyWordExtract.class.getClassLoader();
            }
            InputStream in = loader == null
                    ? ClassLoader.getSystemResourceAsStream(classpath)
                    : loader.getResourceAsStream(classpath);
            if (in == null) {
                throw new FileNotFoundException("Classpath resource [" + classpath + "] not found.");
            }
            // UTF-8 to match Files.readAllLines above.
            try (BufferedReader buffer = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
                return buffer.lines().collect(Collectors.toList());
            }
        }

        /**
         * Fetches {@code url} and returns the response body as lines, read as UTF-8.
         *
         * @throws IOException if the URL is malformed, or connecting or reading fails or times out
         */
        private static List<String> httpGet(String url) throws IOException {
            URL getUrl = new URL(url);
            URLConnection connection = getUrl.openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            connection.connect();
            try (BufferedReader buffer = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                return buffer.lines().collect(Collectors.toList());
            }
        }

    }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy