
org.dspace.statistics.util.StatisticsImporter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of dspace-api Show documentation
DSpace core data model and service APIs.
The newest version!
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.statistics.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.InetAddress;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.text.DecimalFormat;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Random;
import java.util.UUID;

import com.maxmind.geoip2.DatabaseReader;
import com.maxmind.geoip2.exception.GeoIp2Exception;
import com.maxmind.geoip2.model.CityResponse;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;
import org.dspace.content.Bitstream;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.CollectionService;
import org.dspace.content.service.CommunityService;
import org.dspace.content.service.DSpaceObjectLegacySupportService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Context;
import org.dspace.eperson.EPerson;
import org.dspace.eperson.factory.EPersonServiceFactory;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
import org.dspace.statistics.factory.StatisticsServiceFactory;
import org.dspace.statistics.service.SolrLoggerService;
/**
* Class to load intermediate statistics files (produced from log files by {@link ClassicDSpaceLogConverter}) into Solr.
*
* @author Stuart Lewis
* @see ClassicDSpaceLogConverter
*/
public class StatisticsImporter {
    private static final Logger log = LogManager.getLogger(StatisticsImporter.class);

    protected final SolrLoggerService solrLoggerService
        = StatisticsServiceFactory.getInstance().getSolrLoggerService();
    protected static final ConfigurationService configurationService
        = DSpaceServicesFactory.getInstance().getConfigurationService();

    /**
     * Solr server connection (initialised in {@link #main(String[])}).
     */
    private static HttpSolrClient solr;

    /**
     * GEOIP lookup service. May remain {@code null} when the GeoLite database
     * file is missing or unreadable; importing then proceeds without location data.
     */
    private static DatabaseReader geoipLookup;

    /**
     * Whether to skip the DNS reverse lookup or not
     */
    private static boolean skipReverseDNS = false;

    /**
     * IDs of local items (populated only when {@link #useLocal} is set)
     */
    private List<UUID> localItems;

    /**
     * IDs of local collections
     */
    private List<UUID> localCollections;

    /**
     * IDs of local communities
     */
    private List<UUID> localCommunities;

    /**
     * IDs of local bitstreams
     */
    private List<UUID> localBitstreams;

    /**
     * Whether or not to replace item IDs with local values (for testing)
     */
    private final boolean useLocal;

    protected final BitstreamService bitstreamService;
    protected final CollectionService collectionService;
    protected final CommunityService communityService;
    protected final ItemService itemService;

    /**
     * Constructor. Optionally loads local data to replace foreign data
     * if using someone else's log files
     *
     * @param local Whether to use local data
     */
    public StatisticsImporter(boolean local) {
        bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
        collectionService = ContentServiceFactory.getInstance().getCollectionService();
        communityService = ContentServiceFactory.getInstance().getCommunityService();
        itemService = ContentServiceFactory.getInstance().getItemService();

        // Setup the lists of communities, collections, items & bitstreams if required
        useLocal = local;
        if (local) {
            try {
                Context c = new Context();

                System.out.print("Loading local communities... ");
                localCommunities = new ArrayList<>();
                for (Community community : communityService.findAll(c)) {
                    localCommunities.add(community.getID());
                }
                System.out.println("Found " + localCommunities.size());

                System.out.print("Loading local collections... ");
                localCollections = new ArrayList<>();
                for (Collection collection : collectionService.findAll(c)) {
                    localCollections.add(collection.getID());
                }
                System.out.println("Found " + localCollections.size());

                System.out.print("Loading local items... ");
                localItems = new ArrayList<>();
                Iterator<Item> items = itemService.findAll(c);
                while (items.hasNext()) {
                    localItems.add(items.next().getID());
                }
                System.out.println("Found " + localItems.size());

                System.out.print("Loading local bitstreams... ");
                localBitstreams = new ArrayList<>();
                for (Bitstream bitstream : bitstreamService.findAll(c)) {
                    // Unnamed bitstreams (e.g. deleted ones) are not usable targets
                    if (bitstream.getName() != null) {
                        localBitstreams.add(bitstream.getID());
                    }
                }
                System.out.println("Found " + localBitstreams.size());
            } catch (SQLException e) {
                System.err.println("Error retrieving items from DSpace database:");
                e.printStackTrace();
                System.exit(1);
            }
        }
    }

    /**
     * Read lines from the statistics file and load their data into solr.
     *
     * <p>Each input line is a comma-separated record produced by
     * {@link ClassicDSpaceLogConverter}:
     * {@code uuid,action,id,date,user,ip}. Lines attributable to known search
     * engine crawlers (by reverse DNS) are skipped; the rest are resolved to a
     * DSpace object and stored as Solr usage events.
     *
     * @param filename The filename of the file to load ('-' or null for standard input)
     * @param context  The DSpace Context
     * @param verbose  Whether to display verbose output
     */
    protected void load(String filename, Context context, boolean verbose) {
        // Counters: every line starts out presumed-failed ("errors") and is
        // decremented again when it is either stored or deliberately skipped.
        int counter = 0;
        int errors = 0;
        int searchengines = 0;

        BufferedReader input = null;
        boolean fromStdin = false;
        try {
            if (null == filename || "-".equals(filename)) {
                input = new BufferedReader(new InputStreamReader(System.in));
                filename = "standard input";
                fromStdin = true;
            } else {
                // Explicit charset: FileReader would use the platform default.
                input = Files.newBufferedReader(Paths.get(filename), StandardCharsets.UTF_8);
            }

            // Print out the filename for confirmation
            System.out.println("Processing file: " + filename);

            DNSCache dnsCache = new DNSCache(2500, 0.75f, 2500);
            Random rand = new Random();
            String line;
            while ((line = input.readLine()) != null) {
                counter++;
                errors++;
                if (verbose) {
                    System.out.println("Line:" + line);
                }

                // Tokenise the line: uuid,action,id,date,user,ip
                String[] parts = line.split(",");
                if (parts.length < 6) {
                    // Malformed record: count it as an error rather than crashing
                    log.warn("Skipping malformed log line {}: {}", counter, line);
                    continue;
                }
                // parts[0] is the original event UUID and is not used here
                String action = parts[1];
                String id = parts[2];
                // Date format (for solr)
                Instant date = LocalDateTime.parse(parts[3]).toInstant(ZoneOffset.UTC);
                String user = parts[4];
                String ip = parts[5];

                // Resolve the dns (if applicable) to get rid of search engine bots early on in the processing chain
                String dns = "";
                if (!skipReverseDNS) {
                    // Is the IP address in the cache?
                    String cached = dnsCache.get(ip);
                    if (cached != null) {
                        dns = cached;
                    } else {
                        try {
                            dns = DnsLookup.reverseDns(ip);
                            dnsCache.put(ip, dns);
                        } catch (IOException e) {
                            dns = "";
                        }
                    }
                }

                String data = "ip addr = " + ip;
                data += ", dns name = " + dns;
                if (dns.endsWith(".googlebot.com.")
                    || dns.endsWith(".crawl.yahoo.net.")
                    || dns.endsWith(".search.msn.com.")) {
                    if (verbose) {
                        System.out.println(data + ", IGNORE (search engine)");
                    }
                    errors--;
                    searchengines++;
                    continue;
                }

                // Get the geo information for the user.
                // Reset per line: previously these values leaked from the prior
                // record whenever a lookup failed, attributing the wrong location.
                String continent = "";
                String country = "";
                String countryCode = "";
                double longitude = 0;
                double latitude = 0;
                String city = "";
                if (geoipLookup != null) { // null when the GeoLite DB could not be loaded
                    try {
                        InetAddress ipAddress = InetAddress.getByName(ip);
                        CityResponse cityResponse = geoipLookup.city(ipAddress);
                        city = cityResponse.getCity().getName();
                        country = cityResponse.getCountry().getName();
                        countryCode = cityResponse.getCountry().getIsoCode();
                        longitude = cityResponse.getLocation().getLongitude();
                        latitude = cityResponse.getLocation().getLatitude();
                        if (verbose) {
                            data += ", country = " + country;
                            data += ", city = " + city;
                            System.out.println(data);
                        }
                        try {
                            continent = LocationUtils.getContinentCode(countryCode);
                        } catch (Exception e) {
                            if (verbose) {
                                System.out.println("Unknown country code: " + countryCode);
                            }
                            continue;
                        }
                    } catch (GeoIp2Exception | IOException e) {
                        // No problem - just can't look them up
                    }
                }

                // Now find our dso
                ContentServiceFactory contentServiceFactory = ContentServiceFactory.getInstance();
                DSpaceObjectLegacySupportService<? extends DSpaceObject> legacySupportService = null;
                if ("view_bitstream".equals(action)) {
                    legacySupportService = contentServiceFactory.getBitstreamService();
                    if (useLocal) {
                        id = "" + localBitstreams.get(rand.nextInt(localBitstreams.size()));
                    }
                } else if ("view_item".equals(action)) {
                    legacySupportService = contentServiceFactory.getItemService();
                    if (useLocal) {
                        id = "" + localItems.get(rand.nextInt(localItems.size()));
                    }
                } else if ("view_collection".equals(action)) {
                    legacySupportService = contentServiceFactory.getCollectionService();
                    if (useLocal) {
                        id = "" + localCollections.get(rand.nextInt(localCollections.size()));
                    }
                } else if ("view_community".equals(action)) {
                    legacySupportService = contentServiceFactory.getCommunityService();
                    if (useLocal) {
                        id = "" + localCommunities.get(rand.nextInt(localCommunities.size()));
                    }
                }
                if (legacySupportService == null) {
                    // Unrecognised action: leave counted as an error
                    continue;
                }

                DSpaceObject dso = legacySupportService.findByIdOrLegacyId(context, id);
                if (dso == null) {
                    if (verbose) {
                        System.err.println(" - DSO with ID '" + id + "' is no longer in the system");
                    }
                    continue;
                }

                // Get the eperson details
                EPerson eperson = EPersonServiceFactory.getInstance().getEPersonService().findByEmail(context, user);
                UUID epersonId = null;
                if (eperson != null) {
                    epersonId = eperson.getID();
                }

                // Save it in our server
                SolrInputDocument sid = new SolrInputDocument();
                sid.addField("ip", ip);
                sid.addField("type", dso.getType());
                sid.addField("id", dso.getID().toString());
                sid.addField("time", date.toString());
                sid.addField("continent", continent);
                sid.addField("country", country);
                sid.addField("countryCode", countryCode);
                sid.addField("city", city);
                sid.addField("latitude", latitude);
                sid.addField("longitude", longitude);
                if (epersonId != null) {
                    sid.addField("epersonid", epersonId);
                }
                sid.addField("dns", dns.toLowerCase());
                solrLoggerService.storeParents(sid, dso);
                solr.add(sid);
                errors--;
            }
        } catch (RuntimeException re) {
            throw re;
        } catch (IOException | SQLException | SolrServerException e) {
            System.err.println(e.getMessage());
            log.error(e.getMessage(), e);
        } finally {
            // Close file readers; keep System.in open so load() can be reused.
            if (input != null && !fromStdin) {
                try {
                    input.close();
                } catch (IOException e) {
                    log.error("Unable to close input file " + filename, e);
                }
            }
        }

        DecimalFormat percentage = new DecimalFormat("##.###");
        int committed = counter - errors - searchengines;
        System.out.println("Processed " + counter + " log lines");
        if (counter > 0) {
            double committedpercentage = 100d * committed / counter;
            System.out
                .println(" - " + committed + " entries added to solr: " + percentage.format(committedpercentage) + "%");
            double errorpercentage = 100d * errors / counter;
            System.out.println(" - " + errors + " errors: " + percentage.format(errorpercentage) + "%");
            double sepercentage = 100d * searchengines / counter;
            System.out.println(
                " - " + searchengines + " search engine activity skipped: " + percentage.format(sepercentage) + "%");
            System.out.print("About to commit data to solr...");

            // Commit at the end because it takes a while
            try {
                solr.commit();
            } catch (SolrServerException sse) {
                System.err.println("Error committing statistics to solr server!");
                sse.printStackTrace();
                System.exit(1);
            } catch (IOException ioe) {
                System.err.println("Error writing to solr server!");
                ioe.printStackTrace();
                System.exit(1);
            }
        }
        System.out.println(" done!");
    }

    /**
     * Print the help message
     *
     * @param options  The command line options the user gave
     * @param exitCode the system exit code to use
     */
    private static void printHelp(Options options, int exitCode) {
        // print the help message
        HelpFormatter myhelp = new HelpFormatter();
        myhelp.printHelp("StatisticsImporter\n", options);
        System.exit(exitCode);
    }

    /**
     * Main method to run the statistics importer.
     *
     * @param args the command line arguments given
     * @throws Exception If something goes wrong
     */
    public static void main(String[] args) throws Exception {
        CommandLineParser parser = new DefaultParser();

        Options options = new Options();
        options.addOption("i", "in", true,
                          "the input file ('-' or omit for standard input)");
        options.addOption("l", "local", false,
                          "developers tool - map external log file to local handles");
        options.addOption("m", "multiple", false,
                          "treat the input file as having a wildcard ending");
        options.addOption("s", "skipdns", false,
                          "skip performing reverse DNS lookups on IP addresses");
        options.addOption("v", "verbose", false,
                          "display verbose output (useful for debugging)");
        options.addOption("h", "help", false,
                          "help");
        CommandLine line = parser.parse(options, args);

        // Did the user ask to see the help?
        if (line.hasOption('h')) {
            printHelp(options, 0);
        }
        if (line.hasOption('s')) {
            skipReverseDNS = true;
        }

        // Whether or not to convert handles to handles used in a local system
        // (useful if using someone else's log file for testing)
        boolean local = line.hasOption('l');

        // We got all our parameters now get the rest
        Context context = new Context();

        // Verbose option
        boolean verbose = line.hasOption('v');

        // Find our solr server
        String sserver = configurationService.getProperty("solr-statistics.server");
        if (verbose) {
            System.out.println("Writing to solr server at: " + sserver);
        }
        solr = new HttpSolrClient.Builder(sserver).build();

        String dbPath = configurationService.getProperty("usage-statistics.dbfile");
        try {
            File dbFile = new File(dbPath);
            geoipLookup = new DatabaseReader.Builder(dbFile).build();
        } catch (FileNotFoundException fe) {
            log.error(
                "The GeoLite Database file is missing (" + dbPath + ")! Solr Statistics cannot generate location " +
                    "based reports! Please see the DSpace installation instructions for instructions to install this " +
                    "file.",
                fe);
        } catch (IOException e) {
            log.error(
                "Unable to load GeoLite Database file (" + dbPath + ")! You may need to reinstall it. See the DSpace " +
                    "installation instructions for more details.",
                e);
        } catch (NullPointerException e) {
            log.error(
                "The value of the property usage-statistics.dbfile is null. You may need to install the GeoLite " +
                    "Database file and/or uncomment the property in the config file!",
                e);
        }
        // NOTE: on failure geoipLookup stays null; load() then skips geo lookups.

        StatisticsImporter si = new StatisticsImporter(local);
        if (line.hasOption('m')) {
            // Convert all the files whose names start with the given prefix.
            // getAbsoluteFile() ensures a parent directory exists even for a
            // bare filename, which previously caused a NullPointerException.
            final File sample = new File(line.getOptionValue('i')).getAbsoluteFile();
            File dir = sample.getParentFile();
            FilenameFilter filter = new FilenameFilter() {
                @Override
                public boolean accept(File dir, String name) {
                    return name.startsWith(sample.getName());
                }
            };
            String[] children = dir.list(filter);
            if (children == null) {
                // dir.list returns null when the directory cannot be read
                System.err.println("Unable to list files in " + dir);
                System.exit(1);
            }
            for (String in : children) {
                System.out.println(in);
                si.load(dir.getAbsolutePath() + File.separator + in, context, verbose);
            }
        } else {
            // Just convert the one file
            si.load(line.getOptionValue('i'), context, verbose);
        }
    }

    /**
     * Inner class to hold a bounded LRU cache of reverse lookups of IP
     * addresses (IP address -> hostname), evicting the least recently
     * accessed entry once {@code maxCapacity} is reached.
     */
    static class DNSCache extends LinkedHashMap<String, String> {
        private final int maxCapacity;

        public DNSCache(int initialCapacity, float loadFactor, int maxCapacity) {
            // accessOrder = true gives LRU rather than insertion-order eviction
            super(initialCapacity, loadFactor, true);
            this.maxCapacity = maxCapacity;
        }

        @Override
        protected boolean removeEldestEntry(java.util.Map.Entry<String, String> eldest) {
            return size() >= this.maxCapacity;
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy