org.opensextant.util.FileUtility Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensextant-xponents-core Show documentation
An information extraction toolkit focused on geography and temporal entities
There is a newer version: 3.7.3
Show newest version
/*
 *
 * Copyright 2012-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
// _____                                ____                     __                       __
///\  __`\                             /\  _`\                  /\ \__                   /\ \__
//\ \ \/\ \   _____      __     ___    \ \,\L\_\      __   __  _\ \ ,_\     __       ___ \ \ ,_\
// \ \ \ \ \ /\ '__`\  /'__`\ /' _ `\   \/_\__ \    /'__`\/\ \/'\\ \ \/   /'__`\   /' _ `\\ \ \/
//  \ \ \_\ \\ \ \L\ \/\  __/ /\ \/\ \    /\ \L\ \ /\  __/\/>  = 'A' && c <= 'Z') {
            return c;
        }
        if (c >= 'a' && c <= 'z') {
            return c;
        }
        if (c >= '0' && c <= '9') {
            return c;
        }
        if (c == '_' || c == '-') {
            return c;
        } else {
            return FILENAME_REPLACE_CHAR;
        }
    }

    /**
     * Reports whether the JVM is running on a Windows operating system, judged by the
     * {@code os.name} system property. Beware: OS X has "Darwin" in its full OS name, so the
     * check looks for the substring "Windows" rather than anything shorter.
     *
     * @return true if {@code os.name} is set and contains "Windows"; false otherwise
     */
    public static boolean isWindowsSystem() {
        final String osName = System.getProperty("os.name");
        if (osName == null) {
            // OS name is not available; report non-Windows.
            return false;
        }
        return osName.contains("Windows");
    }

    /**
     * Comment marker used in config files and dictionary files. When a dictionary is loaded by
     * {@code loadDict}, any line whose trimmed text starts with this marker is skipped.
     */
    public static final String COMMENT_CHAR = "#";

    /**
     * Loads a word list from a classpath resource. The resource is resolved relative to
     * {@code FileUtility.class}; the opened stream is handed to {@code loadDict}, which
     * closes it.
     *
     * @param resourcepath   classpath location of a resource
     * @param case_sensitive if terms are loaded with case preserved or not.
     * @author ubaldino, MITRE Corp
     * @return Set containing unique words found in resourcepath
     * @throws IOException on error, or if the resource does not exist
     */
    public static Set<String> loadDictionary(String resourcepath, boolean case_sensitive) throws IOException {
        final InputStream io = FileUtility.class.getResourceAsStream(resourcepath);
        if (io == null) {
            // getResourceAsStream signals "not found" with null rather than an exception.
            throw new IOException("Resource " + resourcepath + "  was not found");
        }
        return loadDict(io, case_sensitive);
    }

    /**
     * Loads a word list from a URL. The stream opened from the URL is handed to
     * {@code loadDict}, which closes it.
     *
     * @param resourcepath   URL of the word list to read
     * @param case_sensitive if terms are loaded with case preserved or not.
     * @author ubaldino, MITRE Corp
     * @return Set containing unique words found in resourcepath
     * @throws IOException on error, or if resourcepath is null
     */
    public static Set<String> loadDictionary(URL resourcepath, boolean case_sensitive) throws IOException {
        if (resourcepath == null) {
            // A null URL is reported the same way as a missing classpath resource, not as a
            // bare NullPointerException.
            throw new IOException("Resource URL was not found (null)");
        }
        return loadDict(resourcepath.openStream(), case_sensitive);
    }

    /**
     * The do-all method: loads the dictionary from a stream, one entry per line. Each line is
     * trimmed; blank lines and lines starting with {@code COMMENT_CHAR} are skipped. When
     * {@code case_sensitive} is false every entry is lower-cased before being added.
     * This closes the stream when done.
     *
     * @param io             stream
     * @param case_sensitive true if data should be loaded preserving case
     * @return set of phrases from file.
     * @throws IOException on IO error
     */
    public static Set<String> loadDict(InputStream io, boolean case_sensitive) throws IOException {

        // The raw stream is declared as a resource as well as the reader, so it is closed even
        // if constructing the reader fails before the reader can take ownership of it.
        try (InputStream in = io;
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, default_encoding))) {

            final Set<String> dict = new HashSet<>();
            String line;
            while ((line = reader.readLine()) != null) {
                final String term = line.trim();
                if (term.isEmpty() || term.startsWith(COMMENT_CHAR)) {
                    continue;
                }
                dict.add(case_sensitive ? term : term.toLowerCase());
            }
            return dict;
        }
    }

    /**
     * Load a word list from a file path. The file is opened here and passed to
     * {@code loadDict}; the stream is closed on exit from this method.
     *
     * @param resourcepath   File object to load
     * @param case_sensitive if dictionary is loaded with case or not.
     * @return a Set object containing distinct dictionary terms
     * @throws IOException if load fails
     */
    public static Set<String> loadDictionary(File resourcepath, boolean case_sensitive) throws IOException {
        try (InputStream io = new FileInputStream(resourcepath)) {
            return loadDict(io, case_sensitive);
        }
    }

    //
    //
    //  Working with file types
    //
    //
    /**
     * Lookup table from file extension (lower case, no leading dot) to one of the
     * plain-language type names below. Filled in by the static initializer.
     */
    private static final HashMap<String, String> filetypeMap = new HashMap<>();
    public static final String IMAGE_MIMETYPE = "image";
    public static final String DOC_MIMETYPE = "document";
    public static final String MESSAGE_MIMETYPE = "message";
    public static final String APP_MIMETYPE = "application";
    public static final String VID_MIMETYPE = "video";
    public static final String AUD_MIMETYPE = "audio";
    public static final String FOLDER_MIMETYPE = "folder";
    public static final String FEED_MIMETYPE = "feed";
    public static final String DATA_MIMETYPE = "data";
    public static final String WEBARCHIVE_MIMETYPE = "web archive";
    public static final String WEBPAGE_MIMETYPE = "web page";
    public static final String SPREADSHEET_MIMETYPE = "spreadsheet";
    public static final String NOT_AVAILABLE = "other";
    public static final String GIS_MIMETYPE = "GIS data";

    /**
     * Image extensions only; the static initializer also copies these entries into
     * {@code filetypeMap}.
     */
    private static final HashMap<String, String> imageTypeMap = new HashMap<>();

    static {

        // Image
        imageTypeMap.put("jpg", IMAGE_MIMETYPE);
        imageTypeMap.put("jpeg", IMAGE_MIMETYPE);
        imageTypeMap.put("jp2", IMAGE_MIMETYPE);
        imageTypeMap.put("jpx", IMAGE_MIMETYPE);
        imageTypeMap.put("ico", IMAGE_MIMETYPE);
        imageTypeMap.put("bmp", IMAGE_MIMETYPE);
        imageTypeMap.put("gif", IMAGE_MIMETYPE);
        imageTypeMap.put("png", IMAGE_MIMETYPE);
        imageTypeMap.put("tif", IMAGE_MIMETYPE);
        imageTypeMap.put("tiff", IMAGE_MIMETYPE);
        filetypeMap.putAll(imageTypeMap);

        filetypeMap.put("", NOT_AVAILABLE);

        // GIS Data
        filetypeMap.put("gdb", GIS_MIMETYPE);
        filetypeMap.put("shp", GIS_MIMETYPE);
        filetypeMap.put("kml", GIS_MIMETYPE);
        filetypeMap.put("kmz", GIS_MIMETYPE);

        // Data
        filetypeMap.put("dat", DATA_MIMETYPE);
        filetypeMap.put("xml", DATA_MIMETYPE);
        filetypeMap.put("rdf", DATA_MIMETYPE);

        // Archive
        filetypeMap.put("mht", WEBARCHIVE_MIMETYPE);
        filetypeMap.put("mhtml", WEBARCHIVE_MIMETYPE);

        filetypeMap.put("csv", SPREADSHEET_MIMETYPE);
        filetypeMap.put("xls", SPREADSHEET_MIMETYPE);
        filetypeMap.put("xlsx", SPREADSHEET_MIMETYPE);

        filetypeMap.put("htm", WEBPAGE_MIMETYPE);
        filetypeMap.put("html", WEBPAGE_MIMETYPE);

        // Docs
        filetypeMap.put("odf", DOC_MIMETYPE);
        filetypeMap.put("doc", DOC_MIMETYPE);
        filetypeMap.put("ppt", DOC_MIMETYPE);
        filetypeMap.put("pdf", DOC_MIMETYPE);
        filetypeMap.put("ps", DOC_MIMETYPE);
        filetypeMap.put("vsd", DOC_MIMETYPE);
        filetypeMap.put("txt", DOC_MIMETYPE);
        filetypeMap.put("pptx", DOC_MIMETYPE);
        filetypeMap.put("docx", DOC_MIMETYPE);

        // Messages
        filetypeMap.put("eml", MESSAGE_MIMETYPE);
        filetypeMap.put("emlx", MESSAGE_MIMETYPE);
        filetypeMap.put("msg", MESSAGE_MIMETYPE);
        filetypeMap.put("sms", MESSAGE_MIMETYPE);

        //Apps
        filetypeMap.put("do", APP_MIMETYPE);
        filetypeMap.put("aspx", APP_MIMETYPE);
        filetypeMap.put("asp", APP_MIMETYPE);
        filetypeMap.put("axd", APP_MIMETYPE);
        filetypeMap.put("js", APP_MIMETYPE);
        filetypeMap.put("php", APP_MIMETYPE);
        filetypeMap.put("vbs", APP_MIMETYPE);
        filetypeMap.put("vb", APP_MIMETYPE);
        filetypeMap.put("vba", APP_MIMETYPE);

        // Video
        filetypeMap.put("mov", VID_MIMETYPE);

        filetypeMap.put("rm", VID_MIMETYPE);
        filetypeMap.put("wmv", VID_MIMETYPE);
        filetypeMap.put("mp4", VID_MIMETYPE);
        filetypeMap.put("mpeg", VID_MIMETYPE);
        filetypeMap.put("mpg", VID_MIMETYPE);

        // Audio
        filetypeMap.put("au", AUD_MIMETYPE);
        filetypeMap.put("wma", AUD_MIMETYPE);
        filetypeMap.put("mp3", AUD_MIMETYPE);
        filetypeMap.put("ra", AUD_MIMETYPE);

        // Data Feed
        filetypeMap.put("rss", FEED_MIMETYPE);
    }

    /**
     * Get a plain language name of the type of file. E.g., document, image, spreadsheet, web page.
     * Rather than the MIME type technical descriptor.
     * <p>
     * Checks are applied in this order: null input; a non-http path ending in "/" (folder);
     * a known file extension; any path containing "rss" (feed); an http/https URL (web page);
     * any other path containing "/" (folder); otherwise {@code NOT_AVAILABLE}.
     *
     * @param url item to describe
     * @return plain language description of the URL
     */
    public static String getFileDescription(String url) {
        if (url == null) {
            return NOT_AVAILABLE;
        }

        // All matching is done on a lower-cased copy, e.g. path: http://a/b.htm
        final String test = url.toLowerCase();

        // path: /a/b/
        if (url.endsWith("/") && !test.startsWith("http")) {
            return FOLDER_MIMETYPE;
        }

        final String urlTestExtension = FilenameUtils.getExtension(test);

        /*
         * Known file type. The table is consulted only for a non-empty extension: it maps ""
         * to NOT_AVAILABLE, so looking up an empty extension would return "other" right here
         * and the feed / web page / folder checks below would never run for extensionless
         * inputs such as http://a/b or /some/default/path.
         */
        if (urlTestExtension != null && !urlTestExtension.isEmpty()) {
            final String urlMimeType = filetypeMap.get(urlTestExtension);
            if (urlMimeType != null) {
                return urlMimeType;
            }
        }

        // "rss" anywhere in the path, beyond the plain .rss extension handled by the table.
        if (test.contains("rss")) {
            return FEED_MIMETYPE;
        }

        if (test.startsWith("http:") || test.startsWith("https:")) {
            return WEBPAGE_MIMETYPE;
        }

        // path: /some/default/path
        if (url.contains("/")) {
            return FOLDER_MIMETYPE;
        }

        // Give up.
        return NOT_AVAILABLE;
    }

    /**
     * Tests whether a path or link looks like a web URL. Handy when examining URLs found in
     * unstructured data. The comparison ignores case.
     *
     * @param link a URL; may be null
     * @return true if link starts with http: or https:, false otherwise (including for null)
     */
    public static boolean isWebURL(String link) {
        if (link == null) {
            return false;
        }
        final String lowered = link.toLowerCase();
        return lowered.startsWith("http:") || lowered.startsWith("https:");
    }

    /**
     * Tell if the file is JSON/Gzip, judged by its name only. The comparison ignores case.
     *
     * @param path input file path; may be null
     * @return true if the path ends with json.gz, or contains json and ends with .gz;
     *         false otherwise, including for a null path
     */
    public static boolean isJSONGzip(String path) {
        if (path == null) {
            // Mirrors isWebURL(): a missing path is simply "not a match", not an error.
            return false;
        }
        final String test = path.toLowerCase();
        // A name ending in "json.gz" also contains "json" and ends with ".gz", so this one
        // test covers both documented cases.
        return test.endsWith(".gz") && test.contains("json");
    }
}