All downloads are free. The search and download functionality uses the official Maven repository.

org.dspace.statistics.util.StatisticsImporter Maven / Gradle / Ivy

The newest version!
/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.statistics.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.InetAddress;
import java.sql.SQLException;
import java.text.DecimalFormat;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Random;
import java.util.UUID;

import com.maxmind.geoip2.DatabaseReader;
import com.maxmind.geoip2.exception.GeoIp2Exception;
import com.maxmind.geoip2.model.CityResponse;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;
import org.dspace.content.Bitstream;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.CollectionService;
import org.dspace.content.service.CommunityService;
import org.dspace.content.service.DSpaceObjectLegacySupportService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Context;
import org.dspace.eperson.EPerson;
import org.dspace.eperson.factory.EPersonServiceFactory;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
import org.dspace.statistics.factory.StatisticsServiceFactory;
import org.dspace.statistics.service.SolrLoggerService;

/**
 * Class to load intermediate statistics files (produced from log files by {@link ClassicDSpaceLogConverter}) into Solr.
 *
 * <p>Every input line is split on commas. Fields 1 to 5 are read as the action, the object id, the timestamp
 * (parsed with {@link LocalDateTime#parse(CharSequence)} and interpreted as UTC), the user's e-mail address and
 * the client IP address. Field 0 is not used.
 *
 * @author Stuart Lewis
 * @see ClassicDSpaceLogConverter
 */
public class StatisticsImporter {
    private static final Logger log = LogManager.getLogger(StatisticsImporter.class);

    /**
     * Minimum number of comma-separated fields a line must have (the highest index read is 5).
     */
    private static final int MIN_FIELDS = 6;

    protected final SolrLoggerService solrLoggerService
            = StatisticsServiceFactory.getInstance().getSolrLoggerService();
    protected static final ConfigurationService configurationService
            = DSpaceServicesFactory.getInstance().getConfigurationService();

    /**
     * Solr server connection
     */
    private static HttpSolrClient solr;

    /**
     * GEOIP lookup service. Stays {@code null} when the database file could not be loaded by {@link #main}.
     */
    private static DatabaseReader geoipLookup;

    /**
     * Whether to skip the DNS reverse lookup or not
     */
    private static boolean skipReverseDNS = false;

    /**
     * Local items
     */
    private List<UUID> localItems;

    /**
     * Local collections
     */
    private List<UUID> localCollections;

    /**
     * Local communities
     */
    private List<UUID> localCommunities;

    /**
     * Local bitstreams
     */
    private List<UUID> localBitstreams;

    /**
     * Whether or not to replace item IDs with local values (for testing)
     */
    private final boolean useLocal;

    protected final BitstreamService bitstreamService;
    protected final CollectionService collectionService;
    protected final CommunityService communityService;
    protected final ItemService itemService;

    /**
     * Constructor. Optionally loads local data to replace foreign data
     * if using someone else's log files.
     *
     * <p>When {@code local} is true the IDs of all communities, collections, items and named bitstreams are
     * read from the database. If that fails with an {@link SQLException} the JVM is terminated with exit code 1.
     *
     * @param local Whether to use local data
     */
    public StatisticsImporter(boolean local) {
        ContentServiceFactory contentServiceFactory = ContentServiceFactory.getInstance();
        bitstreamService = contentServiceFactory.getBitstreamService();
        collectionService = contentServiceFactory.getCollectionService();
        communityService = contentServiceFactory.getCommunityService();
        itemService = contentServiceFactory.getItemService();

        // Setup the lists of communities, collections, items & bitstreams if required
        useLocal = local;
        if (!local) {
            return;
        }

        try {
            System.out.print("Loading local communities... ");
            Context c = new Context();
            List<Community> communities = communityService.findAll(c);
            localCommunities = new ArrayList<>();
            for (Community community : communities) {
                localCommunities.add(community.getID());
            }
            System.out.println("Found " + localCommunities.size());

            System.out.print("Loading local collections... ");
            List<Collection> collections = collectionService.findAll(c);
            localCollections = new ArrayList<>();
            for (Collection collection : collections) {
                localCollections.add(collection.getID());
            }
            System.out.println("Found " + localCollections.size());

            System.out.print("Loading local items... ");
            Iterator<Item> items = itemService.findAll(c);
            localItems = new ArrayList<>();
            while (items.hasNext()) {
                localItems.add(items.next().getID());
            }
            System.out.println("Found " + localItems.size());

            System.out.print("Loading local bitstreams... ");
            List<Bitstream> bitstreams = bitstreamService.findAll(c);
            localBitstreams = new ArrayList<>();
            for (Bitstream bitstream : bitstreams) {
                // Only bitstreams that have a name are eligible as replacement targets
                if (bitstream.getName() != null) {
                    localBitstreams.add(bitstream.getID());
                }
            }
            System.out.println("Found " + localBitstreams.size());
        } catch (SQLException e) {
            System.err.println("Error retrieving items from DSpace database:");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Read lines from the statistics file and load their data into solr.
     *
     * <p>Lines that cannot be imported (too few fields, unparsable date, unknown action, unknown country code,
     * object no longer present, ...) are counted as errors and skipped; they do not abort the import.
     * Lines whose reverse DNS name belongs to one of the known search engines are skipped and counted
     * separately. A commit is sent to Solr once at the end if at least one line was read.
     *
     * @param filename The filename of the file to load; {@code null} or {@code "-"} reads standard input
     * @param context  The DSpace Context
     * @param verbose  Whether to display verbose output
     */
    protected void load(String filename, Context context, boolean verbose) {
        // Item counter
        int counter = 0;
        int errors = 0;
        int searchengines = 0;

        final boolean fromStdin = null == filename || "-".equals(filename);
        final String sourceName = fromStdin ? "standard input" : filename;

        BufferedReader input = null;
        try {
            if (fromStdin) {
                input = new BufferedReader(new InputStreamReader(System.in));
            } else {
                input = new BufferedReader(new FileReader(new File(filename)));
            }

            // Print out the filename for confirmation
            System.out.println("Processing file: " + sourceName);

            DNSCache<String, String> dnsCache = new DNSCache<>(2500, 0.75f, 2500);
            Random rand = new Random();

            String line;
            while ((line = input.readLine()) != null) {
                counter++;
                // Pessimistically count the line as an error; undone when it is stored or skipped as a bot
                errors++;
                if (verbose) {
                    System.out.println("Line:" + line);
                }

                // Tokenise the line
                String[] parts = line.split(",");
                if (parts.length < MIN_FIELDS) {
                    reportSkippedLine(counter, "expected at least " + MIN_FIELDS + " comma-separated fields",
                                      verbose);
                    continue;
                }
                String action = parts[1];
                String id = parts[2];
                String user = parts[4];
                String ip = parts[5];

                // Date format (for solr)
                Instant date;
                try {
                    date = LocalDateTime.parse(parts[3]).toInstant(ZoneOffset.UTC);
                } catch (java.time.format.DateTimeParseException e) {
                    reportSkippedLine(counter, "unparsable date '" + parts[3] + "'", verbose);
                    continue;
                }

                // Resolve the dns (if applicable) to get rid of search engine bots early on in the processing chain
                String dns = skipReverseDNS ? "" : resolveDns(ip, dnsCache);

                String data = "ip addr = " + ip + ", dns name = " + dns;
                if (isSearchEngine(dns)) {
                    if (verbose) {
                        System.out.println(data + ", IGNORE (search engine)");
                    }
                    errors--;
                    searchengines++;
                    continue;
                }

                // Geo information is scoped to this line so that a failed lookup can never
                // reuse the location that was resolved for a previous line.
                String continent = "";
                String country = "";
                String countryCode = "";
                String city = "";
                double longitude = 0d;
                double latitude = 0d;

                // Get the geo information for the user (only possible if the database was loaded)
                if (geoipLookup != null) {
                    try {
                        InetAddress ipAddress = InetAddress.getByName(ip);
                        CityResponse cityResponse = geoipLookup.city(ipAddress);
                        city = cityResponse.getCity().getName();
                        country = cityResponse.getCountry().getName();
                        countryCode = cityResponse.getCountry().getIsoCode();
                        // Held as boxed values first: assigning a null coordinate straight to a
                        // double would throw a NullPointerException while unboxing.
                        Double lon = cityResponse.getLocation().getLongitude();
                        Double lat = cityResponse.getLocation().getLatitude();
                        if (lon != null) {
                            longitude = lon;
                        }
                        if (lat != null) {
                            latitude = lat;
                        }
                        if (verbose) {
                            data += (", country = " + country);
                            data += (", city = " + city);
                            System.out.println(data);
                        }
                        try {
                            continent = LocationUtils.getContinentCode(countryCode);
                        } catch (Exception e) {
                            if (verbose) {
                                System.out.println("Unknown country code: " + countryCode);
                            }
                            continue;
                        }
                    } catch (GeoIp2Exception | IOException e) {
                        // No problem - just can't look them up
                    }
                }

                // Now find our dso
                DSpaceObjectLegacySupportService<? extends DSpaceObject> legacySupportService;
                List<UUID> localIds;
                switch (action) {
                    case "view_bitstream":
                        legacySupportService = bitstreamService;
                        localIds = localBitstreams;
                        break;
                    case "view_item":
                        legacySupportService = itemService;
                        localIds = localItems;
                        break;
                    case "view_collection":
                        legacySupportService = collectionService;
                        localIds = localCollections;
                        break;
                    case "view_community":
                        legacySupportService = communityService;
                        localIds = localCommunities;
                        break;
                    default:
                        legacySupportService = null;
                        localIds = null;
                        break;
                }
                if (legacySupportService == null) {
                    continue;
                }
                if (useLocal) {
                    id = randomLocalId(localIds, rand);
                    if (id == null) {
                        reportSkippedLine(counter, "no local object available for action '" + action + "'",
                                          verbose);
                        continue;
                    }
                }

                DSpaceObject dso = legacySupportService.findByIdOrLegacyId(context, id);
                if (dso == null) {
                    if (verbose) {
                        System.err.println(" - DSO with ID '" + id + "' is no longer in the system");
                    }
                    continue;
                }

                // Get the eperson details
                EPerson eperson = EPersonServiceFactory.getInstance().getEPersonService().findByEmail(context, user);
                UUID epersonId = null;
                if (eperson != null) {
                    epersonId = eperson.getID();
                }

                // Save it in our server
                SolrInputDocument sid = new SolrInputDocument();
                sid.addField("ip", ip);
                sid.addField("type", dso.getType());
                sid.addField("id", dso.getID().toString());
                sid.addField("time", date.toString());
                sid.addField("continent", continent);
                sid.addField("country", country);
                sid.addField("countryCode", countryCode);
                sid.addField("city", city);
                sid.addField("latitude", latitude);
                sid.addField("longitude", longitude);
                if (epersonId != null) {
                    sid.addField("epersonid", epersonId);
                }
                // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
                sid.addField("dns", dns.toLowerCase(java.util.Locale.ROOT));

                solrLoggerService.storeParents(sid, dso);
                solr.add(sid);
                errors--;
            }

        } catch (IOException | SQLException | SolrServerException e) {
            System.err.println(e.getMessage());
            log.error(e.getMessage(), e);
        } finally {
            // Close files we opened ourselves; standard input is left open for the rest of the process
            if (input != null && !fromStdin) {
                try {
                    input.close();
                } catch (IOException e) {
                    log.warn("Unable to close " + sourceName, e);
                }
            }
        }

        DecimalFormat percentage = new DecimalFormat("##.###");
        int committed = counter - errors - searchengines;
        System.out.println("Processed " + counter + " log lines");
        if (counter > 0) {
            double committedpercentage = 100d * committed / counter;
            System.out
                .println(" - " + committed + " entries added to solr: " + percentage.format(committedpercentage) + "%");
            double errorpercentage = 100d * errors / counter;
            System.out.println(" - " + errors + " errors: " + percentage.format(errorpercentage) + "%");
            double sepercentage = 100d * searchengines / counter;
            System.out.println(
                " - " + searchengines + " search engine activity skipped: " + percentage.format(sepercentage) + "%");
            System.out.print("About to commit data to solr...");

            // Commit at the end because it takes a while
            try {
                solr.commit();
            } catch (SolrServerException sse) {
                System.err.println("Error committing statistics to solr server!");
                sse.printStackTrace();
                System.exit(1);
            } catch (IOException ioe) {
                System.err.println("Error writing to solr server!");
                ioe.printStackTrace();
                System.exit(1);
            }
        }
        System.out.println(" done!");
    }

    /**
     * Report an input line that is skipped and counted as an error.
     *
     * @param lineNumber 1-based number of the line within the current input
     * @param reason     why the line cannot be imported
     * @param verbose    whether to also print the reason to standard error
     */
    private static void reportSkippedLine(int lineNumber, String reason, boolean verbose) {
        log.warn("Skipping input line {}: {}", lineNumber, reason);
        if (verbose) {
            System.err.println(" - Skipping line " + lineNumber + ": " + reason);
        }
    }

    /**
     * Reverse-resolve an IP address, consulting the cache first. Successful lookups are cached.
     *
     * @param ip    the IP address to resolve
     * @param cache cache of previous lookups, keyed by IP address
     * @return the DNS name, or the empty string if the lookup failed with an {@link IOException}
     */
    private static String resolveDns(String ip, DNSCache<String, String> cache) {
        String cached = cache.get(ip);
        if (cached != null) {
            return cached;
        }
        try {
            String dns = DnsLookup.reverseDns(ip);
            cache.put(ip, dns);
            return dns;
        } catch (IOException e) {
            return "";
        }
    }

    /**
     * Tell whether a reverse DNS name belongs to one of the search engine crawlers that are ignored.
     *
     * @param dns the (fully qualified, dot-terminated) DNS name
     * @return true if the name ends with one of the known crawler domains
     */
    private static boolean isSearchEngine(String dns) {
        return dns.endsWith(".googlebot.com.")
            || dns.endsWith(".crawl.yahoo.net.")
            || dns.endsWith(".search.msn.com.");
    }

    /**
     * Pick a random ID from a list of local object IDs.
     *
     * @param ids  the candidate IDs, may be {@code null} or empty
     * @param rand source of randomness
     * @return the string form of a randomly chosen ID, or {@code null} if there is none to choose from
     */
    private static String randomLocalId(List<UUID> ids, Random rand) {
        if (ids == null || ids.isEmpty()) {
            return null;
        }
        return ids.get(rand.nextInt(ids.size())).toString();
    }

    /**
     * Print the help message
     *
     * @param options  The command line options the user gave
     * @param exitCode the system exit code to use
     */
    private static void printHelp(Options options, int exitCode) {
        // print the help message
        HelpFormatter myhelp = new HelpFormatter();
        myhelp.printHelp("StatisticsImporter\n", options);
        System.exit(exitCode);
    }

    /**
     * Main method to run the statistics importer.
     *
     * @param args the command line arguments given
     * @throws Exception If something goes wrong
     */
    public static void main(String[] args) throws Exception {
        CommandLineParser parser = new DefaultParser();

        Options options = new Options();

        options.addOption("i", "in", true,
                          "the input file ('-' or omit for standard input)");
        options.addOption("l", "local", false,
                          "developers tool - map external log file to local handles");
        options.addOption("m", "multiple", false,
                          "treat the input file as having a wildcard ending");
        options.addOption("s", "skipdns", false,
                          "skip performing reverse DNS lookups on IP addresses");
        options.addOption("v", "verbose", false,
                          "display verbose output (useful for debugging)");
        options.addOption("h", "help", false,
                          "help");

        CommandLine line = parser.parse(options, args);

        // Did the user ask to see the help?
        if (line.hasOption('h')) {
            printHelp(options, 0);
        }

        if (line.hasOption('s')) {
            skipReverseDNS = true;
        }

        // Whether or not to convert handles to handles used in a local system
        // (useful if using someone else's log file for testing)
        boolean local = line.hasOption('l');

        // We got all our parameters now get the rest
        Context context = new Context();

        // Verbose option
        boolean verbose = line.hasOption('v');

        // Find our solr server
        String sserver = configurationService.getProperty("solr-statistics.server");
        if (verbose) {
            System.out.println("Writing to solr server at: " + sserver);
        }
        solr = new HttpSolrClient.Builder(sserver).build();

        // Load the GeoLite database; on failure geoipLookup stays null and load() skips geo lookups
        String dbPath = configurationService.getProperty("usage-statistics.dbfile");
        if (dbPath == null) {
            log.error(
                    "The value of the property usage-statistics.dbfile is null. You may need to install the GeoLite " +
                    "Database file and/or uncomment the property in the config file!");
        } else {
            try {
                File dbFile = new File(dbPath);
                geoipLookup = new DatabaseReader.Builder(dbFile).build();
            } catch (FileNotFoundException fe) {
                log.error(
                    "The GeoLite Database file is missing (" + dbPath + ")! Solr Statistics cannot generate location " +
                        "based reports! Please see the DSpace installation instructions for instructions to install " +
                        "this file.",
                    fe);
            } catch (IOException e) {
                log.error(
                    "Unable to load GeoLite Database file (" + dbPath + ")! You may need to reinstall it. See the " +
                        "DSpace installation instructions for more details.",
                    e);
            }
        }

        StatisticsImporter si = new StatisticsImporter(local);
        if (line.hasOption('m')) {
            // Convert all the files whose name starts with the given file name
            String prefixPath = line.getOptionValue('i');
            if (prefixPath == null) {
                System.err.println("The 'multiple' option requires an input file to be given with the 'in' option");
                printHelp(options, 1);
                return;
            }
            final File sample = new File(prefixPath);
            // Resolve against the working directory first so that a bare file name still has a parent
            final File dir = sample.getAbsoluteFile().getParentFile();
            FilenameFilter filter = (parent, name) -> name.startsWith(sample.getName());
            // File.list returns null when the path is not a readable directory
            String[] children = (dir == null) ? null : dir.list(filter);
            if (children == null) {
                System.err.println("Unable to list the directory containing " + prefixPath);
                System.exit(1);
                return;
            }
            for (String in : children) {
                System.out.println(in);
                si.load(new File(dir, in).getAbsolutePath(), context, verbose);
            }
        } else {
            // Just convert the one file
            si.load(line.getOptionValue('i'), context, verbose);
        }
    }


    /**
     * Inner class to hold a cache of reverse lookups of IP addresses.
     * The map is access-ordered and evicts its eldest (least recently accessed) entry
     * whenever an insertion makes it grow beyond {@code maxCapacity}.
     *
     * @param <K> key type.
     * @param <V> value type.
     */
    static class DNSCache<K, V> extends LinkedHashMap<K, V> {
        private final int maxCapacity;

        /**
         * @param initialCapacity initial capacity of the underlying map
         * @param loadFactor      load factor of the underlying map
         * @param maxCapacity     maximum number of entries retained in the cache
         */
        public DNSCache(int initialCapacity, float loadFactor, int maxCapacity) {
            super(initialCapacity, loadFactor, true);
            this.maxCapacity = maxCapacity;
        }

        @Override
        protected boolean removeEldestEntry(java.util.Map.Entry<K, V> eldest) {
            // Called after the new entry has been inserted, so evict only once the limit is exceeded
            return size() > this.maxCapacity;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy