
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.util;
import java.io.IOException;
import java.sql.SQLException;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.UUID;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.dspace.content.Bitstream;
import org.dspace.content.Community;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.CollectionService;
import org.dspace.content.service.CommunityService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.eperson.EPerson;
import org.dspace.eperson.Group;
import org.dspace.eperson.factory.EPersonServiceFactory;
import org.dspace.eperson.service.EPersonService;
import org.dspace.eperson.service.GroupService;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
/**
 * CLI tool to upgrade legacy id references in SOLR statistics to DSpace 6 UUIDs.
*
* This command will need to be run iteratively over each statistics shard until all legacy id values have
* been replaced.
*
* If a legacy id cannot be resolved from the database, the id will remain unchanged.
* "field:* AND NOT(field:*-*)" can be used to locate legacy ids
*
* See DS-3602 for the origin of this issue. This code is targeted for inclusion in the DSpace 6.1 release.
*
* Recommendation: for a large repository, run this command with -Xmx2000m if possible.
*
 * In testing, processing 1,000,000 statistics records took roughly 60 minutes.
*
* @author Terry Brady, Georgetown University Library
*/
public class SolrUpgradePre6xStatistics {
//Command line parameter constants
private static final String INDEX_NAME_OPTION = "i";
private static final String NUMREC_OPTION = "n";
private static final String BATCH_OPTION = "b";
private static final String TYPE_OPTION = "t";
private static final String HELP_OPTION = "h";
private static final int NUMREC_DEFAULT = 100000;
private static final int BATCH_DEFAULT = 10000;
//After processing each batch of updates to SOLR, evaluate if the hibernate cache needs to be cleared
private static final int CACHE_LIMIT = 20000;
private static final String INDEX_DEFAULT = "statistics";
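    // Pre-6x legacy ids are plain integers, while DSpace 6 UUIDs contain hyphens, so
    // "field:* AND -(field:*-*)" matches only records that still carry a legacy id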
private static final String MIGQUERY =
"(id:* AND -(id:*-*)) OR (scopeId:* AND -(scopeId:*-*)) OR (epersonid:* AND -(epersonid:*-*))";
//Counters to determine the number of items to process
private int numRec = NUMREC_DEFAULT;
private int batchSize = BATCH_DEFAULT;
//Cache management
private int numProcessed = 0;
private long totalCache = 0;
private long numUncache = 0;
    private final List<SolrInputDocument> docs = new ArrayList<>();
private Context context;
//Enum to identify the named SOLR statistics fields to update
private enum FIELD {
id,
scopeId,
owningComm,
owningColl,
owningItem,
epersonid,
owner,
submitter,
actor;
}
//Logger
private static final Logger log = LogManager.getLogger();
//DSpace Services
private ConfigurationService configurationService = DSpaceServicesFactory.getInstance().getConfigurationService();
protected CommunityService communityService = ContentServiceFactory.getInstance().getCommunityService();
protected CollectionService collectionService = ContentServiceFactory.getInstance().getCollectionService();
protected ItemService itemService = ContentServiceFactory.getInstance().getItemService();
protected BitstreamService bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
protected EPersonService epersonService = EPersonServiceFactory.getInstance().getEPersonService();
protected GroupService groupService = EPersonServiceFactory.getInstance().getGroupService();
    // This code operates on one shard at a time, so the SOLR web service is accessed directly
    // rather than through the DSpace Solr logger, which only writes to the current shard
private final HttpSolrClient server;
//Allows for smart use of hibernate cache
private Item lastItem = null;
private Bitstream lastBitstream = null;
//Report on process times
private long startTime = -1;
private long lastTime = -1;
/**
* Construct the utility class from the command line options
* @param indexName name of the statistics shard to update
* @param numRec maximum number of records to process
* @param batchSize batch this many documents before updating.
* @throws IOException
* @throws SolrServerException
*/
public SolrUpgradePre6xStatistics(String indexName, int numRec, int batchSize)
throws SolrServerException, IOException {
String serverPath = configurationService.getProperty("solr-statistics.server");
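        // Derive the shard URL by replacing the trailing core name in the configured
        // statistics URL (e.g. .../solr/statistics -> .../solr/statistics-2017 for a yearly shard)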
serverPath = serverPath.replaceAll("statistics$", indexName);
System.out.println("Connecting to " + serverPath);
server = new HttpSolrClient.Builder(serverPath)
.build();
this.numRec = numRec;
this.batchSize = batchSize;
refreshContext();
}
/*
* Process a batch of updates to SOLR
*/
private void batchUpdateStats() throws SolrServerException, IOException {
if (docs.size() > 0) {
server.add(docs);
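            // waitFlush=true, waitSearcher=true: block until the batch is durable and visible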
server.commit(true, true);
docs.clear();
}
}
/**
* Refresh the DSpace Context object in order to periodically release objects from memory
* @throws IOException
* @throws SolrServerException
*/
private void refreshContext() throws SolrServerException, IOException {
if (context != null) {
try {
totalCache += numUncache + context.getCacheSize();
} catch (SQLException e) {
log.warn(e.getMessage());
}
}
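        // Start a fresh read-only context; abandoning the old one lets its hibernate cache be reclaimed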
this.context = new Context(Context.Mode.READ_ONLY);
lastItem = null;
lastBitstream = null;
numUncache = 0;
}
/*
* Compute the number of items that were cached by hibernate since the context was cleared.
*/
private long getCacheCounts(boolean fromStart) {
long count = 0;
try {
count = context.getCacheSize();
} catch (SQLException e) {
//no action
}
count += this.numUncache;
if (fromStart) {
count += totalCache;
}
return count;
}
    /**
     * Compute the elapsed processing time
     *
     * @param fromStart
     *            if true, report on processing time since the start of the program
     * @return elapsed time in ms, measured from the start time or from the previous
     *         call depending on fromStart
     */
private long logTime(boolean fromStart) {
long ret = 0;
long cur = Instant.now().toEpochMilli();
if (lastTime == -1) {
startTime = cur;
} else if (fromStart) {
ret = cur - startTime;
} else {
ret = cur - lastTime;
}
lastTime = cur;
return ret;
}
/*
* Format ms count as h:mm:ss
*
* @param dur Duration in ms
*
* @return duration formatted as h:mm:ss
*/
private String duration(long dur) {
long sec = dur / 1000;
long hh = sec / 3600;
long mm = (sec % 3600) / 60;
long ss = (sec % 60);
return String.format("%d:%02d:%02d", hh, mm, ss);
}
/**
* Print a status message appended with the processing time for the operation
*
* @param numProcessed
* count of records processed so far.
* @param fromStart
* if true, report on processing time since the start of the program
*/
private void printTime(int numProcessed, boolean fromStart) {
long dur = logTime(fromStart);
long totalDur = logTime(true);
String stotalDur = duration(totalDur);
long cacheSize = 0;
try {
cacheSize = context.getCacheSize();
} catch (SQLException e) {
log.error("Cannot get cache size", e);
}
String label = fromStart ? "TOTAL" : "Processed";
System.out.println(String.format("%s (%s; %s; %s)",
String.format("\t%,12d %10s...", numProcessed, label),
String.format("%,6d sec; %s", dur / 1000, stotalDur),
String.format("DB cache: %,6d/%,8d", cacheSize, getCacheCounts(fromStart)),
String.format("Docs: %,6d", docs.size())));
}
/*
* Create command line option processor
*/
private static Options makeOptions() {
Options options = new Options();
options.addOption(HELP_OPTION, "help", false, "Get help on options for this command.");
        options.addOption(INDEX_NAME_OPTION, "index-name", true,
                "The name of the statistics index (shard) to process (default=statistics)");
options.addOption(NUMREC_OPTION, "num-rec", true, "Total number of records to update (default=100,000).");
options.addOption(BATCH_OPTION, "batch-size", true,
"Number of records to batch update to SOLR at one time (default=10,000).");
return options;
}
/**
* A utility method to print out all available command-line options and exit
* given the specified code.
*
* @param options
* the supported options.
* @param exitCode
* the exit code to use. The method will call System#exit(int) with
* the given code.
*/
private static void printHelpAndExit(Options options, int exitCode) {
HelpFormatter myhelp = new HelpFormatter();
myhelp.printHelp(SolrUpgradePre6xStatistics.class.getSimpleName() + "\n", options);
System.out.println("\n\nCommand Defaults");
System.out.println(
"\tsolr-upgrade-statistics-6x [-i statistics] [-n num_recs_to_process] [-b num_rec_to_update_at_once]");
System.out.println("");
System.out.println(
"\tAfter upgrading to DSpace 6, this process should be run iteratively over every statistics shard ");
System.out.println("\t\tuntil there are no remaining records with legacy ids present.");
System.out.println("\t\tThis process can be run while the system is in use.");
System.out.println("");
System.out.println("\tIt will take 20-30 min to process 1,000,000 legacy records. ");
System.out.println("");
System.out.println("\tUse the -n option to manage the workload on your server. ");
System.out.println("\t\tTo process all records, set -n to 10000000 or to 100000000 (10M or 100M)");
System.out.println("\tIf possible, please allocate 2GB of memory to this process (e.g. -Xmx2000m)");
System.out.println("");
System.out.println("\tThis process will rewrite most solr statistics records and may temporarily double ");
System.out.println(
"\t\tthe size of your statistics repositories. Consider optimizing your solr repos when complete.");
System.exit(exitCode);
}
/**
* Entry point for command-line invocation
*
* @param args
* command-line arguments; see help for description
* @throws ParseException
* if the command-line arguments cannot be parsed
*/
public static void main(String[] args) throws ParseException {
CommandLineParser parser = new DefaultParser();
Options options = makeOptions();
System.out.println(" * This process should be run iteratively over every statistics shard ");
System.out.println(" * until there are no remaining records with legacy ids present.");
System.out.println(" * This process can be run while the system is in use.");
System.out.println(" * It is likely to take 1 hour/1,000,000 legacy records to be updated.");
System.out.println(" *");
System.out.println(" * This process will rewrite most solr statistics records and may temporarily double ");
System.out.println(
" *\tthe size of your statistics repositories. Consider optimizing your solr repos when complete.");
System.out.println(" * -------------------------------------------------------------------");
String indexName = INDEX_DEFAULT;
int numrec = NUMREC_DEFAULT;
int batchSize = BATCH_DEFAULT;
try {
CommandLine line = parser.parse(options, args);
if (line.hasOption(HELP_OPTION)) {
printHelpAndExit(options, 0);
}
if (line.hasOption(INDEX_NAME_OPTION)) {
indexName = line.getOptionValue(INDEX_NAME_OPTION, INDEX_DEFAULT);
} else {
System.err.println("No index name provided, defaulting to : " + INDEX_DEFAULT);
}
if (line.hasOption(NUMREC_OPTION)) {
numrec = Integer.parseInt(line.getOptionValue(NUMREC_OPTION, "" + NUMREC_DEFAULT));
}
if (line.hasOption(BATCH_OPTION)) {
batchSize = Integer.parseInt(line.getOptionValue(BATCH_OPTION, "" + BATCH_DEFAULT));
}
} catch (ParseException e) {
System.err.println("Cannot read command options");
printHelpAndExit(options, 1);
}
try {
SolrUpgradePre6xStatistics upgradeStats = new SolrUpgradePre6xStatistics(indexName, numrec, batchSize);
upgradeStats.run();
} catch (SolrServerException | SQLException | IOException e) {
log.error("Error querying stats", e);
}
}
/*
* Report on the existence of legacy id records within a shard
*/
private void runReport() throws SolrServerException, IOException {
System.out.println();
System.out.println("=================================================================");
System.out.println("\t*** Statistics Records with Legacy Id ***\n");
long total = runReportQuery();
System.out.println("\t--------------------------------------");
System.out.println(String.format("\t%,12d\t%s", total, "TOTAL"));
System.out.println("=================================================================");
System.out.println();
}
/*
* Report on the existence of specific legacy id records within a shard
*/
private long runReportQuery() throws SolrServerException, IOException {
SolrQuery sQ = new SolrQuery();
sQ.setQuery(MIGQUERY);
sQ.setFacet(true);
sQ.addFacetField("type");
sQ.addFacetField("scopeType");
QueryResponse sr = server.query(sQ);
long total = 0;
long unexpected = 0;
for (FacetField ff : sr.getFacetFields()) {
String s = ff.getName().equals("type") ? "View" : "Search";
for (FacetField.Count count : ff.getValues()) {
                String name = count.getName();
                int id;
                try {
                    id = Integer.parseInt(name);
                } catch (NumberFormatException e) {
                    // Non-numeric type values have been observed (see note below); group them
                    // with the other unexpected values rather than aborting the report
                    unexpected += count.getCount();
                    continue;
                }
if (id == Constants.COMMUNITY) {
name = "Community " + s;
} else if (id == Constants.COLLECTION) {
name = "Collection " + s;
} else if (id == Constants.ITEM) {
name = "Item " + s;
} else if (id == Constants.BITSTREAM) {
name = "Bitstream " + s;
} else {
/*
* In testing, I discovered some unexpected values in the scopeType field. It
* looks like they may have been a result of a CSV import/export error. This
* will group any unexpected values into one report line.
*/
unexpected += count.getCount();
continue;
}
System.out.println(String.format("\t%,12d\t%s", count.getCount(), name));
total += count.getCount();
}
}
if (unexpected > 0) {
System.out.println(String.format("\t%,12d\t%s", unexpected, "Unexpected Type & Full Site"));
total += unexpected;
}
long rem = sr.getResults().getNumFound() - total;
if (rem > 0) {
System.out.println(String.format("\t%,12d\t%s", rem, "Other Records"));
total += rem;
}
return total;
}
    /*
     * Process records with a legacy id. From the command line, the user may specify
     * records of a specific type to update. Otherwise, the following sequence will
     * be applied in order to optimize hibernate caching.
     *
     * Communities and Collections - retain in the cache since each is likely to be re-used
     * Items - retain in the cache until a new item is processed
     * Bitstreams - retain in the cache until a new bitstream is processed
     */
private void run() throws SolrServerException, SQLException, IOException {
runReport();
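        // Prime the timers; the first call to logTime records the start time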
logTime(false);
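        // Process batches until no legacy records remain or the requested maximum is reached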
for (int processed = updateRecords(MIGQUERY); (processed != 0)
&& (numProcessed <= numRec); processed = updateRecords(MIGQUERY)) {
printTime(numProcessed, false);
batchUpdateStats();
if (context.getCacheSize() > CACHE_LIMIT) {
refreshContext();
}
}
printTime(numProcessed, true);
if (numProcessed > 0) {
runReport();
}
}
    /*
     * Update a batch of statistics records that contain legacy ids
     *
     * @param query Query identifying the statistics records to update
     *
     * @return number of records processed. 0 indicates that no more work is available
     * (or the max processed has been reached).
     */
private int updateRecords(String query) throws SolrServerException, SQLException, IOException {
int initNumProcessed = numProcessed;
SolrQuery sQ = new SolrQuery();
sQ.setQuery(query);
sQ.setRows(batchSize);
// Ensure that items are grouped by id
// Sort by id fails due to presence of id and string fields. The ord function
// seems to help
sQ.addSort("type", SolrQuery.ORDER.desc);
sQ.addSort("scopeType", SolrQuery.ORDER.desc);
sQ.addSort("ord(owningItem)", SolrQuery.ORDER.desc);
sQ.addSort("ord(id)", SolrQuery.ORDER.asc);
sQ.addSort("ord(scopeId)", SolrQuery.ORDER.asc);
QueryResponse sr = server.query(sQ);
SolrDocumentList sdl = sr.getResults();
for (int i = 0; i < sdl.size() && (numProcessed < numRec); i++) {
SolrDocument sd = sdl.get(i);
SolrInputDocument input = new SolrInputDocument(); //ClientUtils.toSolrInputDocument(sd);
for (String name : sd.getFieldNames()) { // https://stackoverflow.com/a/38536843/2916377
input.addField(name, sd.getFieldValue(name));
}
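            // Drop Solr's internal version stamp so the document is re-added without
            // optimistic-concurrency version checks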
input.remove("_version_");
for (FIELD col : FIELD.values()) {
mapField(input, col);
}
docs.add(input);
++numProcessed;
}
return numProcessed - initNumProcessed;
}
/*
* Map solr fields from legacy ids to UUIDs.
*
* The id field is interpreted by the type field. The scopeId field is
     * interpreted by the scopeType field.
     *
     * Legacy ids will be unchanged if they cannot be mapped.
*
* @param input The SOLR statistics document to be updated
*
* @param col The SOLR field to update (if present)
*/
private void mapField(SolrInputDocument input, FIELD col) throws SQLException {
SolrInputField ifield = input.get(col.name());
if (ifield != null) {
            Collection<Object> vals = ifield.getValues();
            List<Object> newVals = new ArrayList<>();
            boolean changed = false;
            for (Object oval : vals) {
                try {
                    // Legacy ids are plain integers; UUID values will not parse and are retained
                    int legacyId = Integer.parseInt(oval.toString());
                    UUID uuid = mapToUuid(input, col, legacyId); // see reconstruction note below
                    if (uuid == null) {
                        // Id could not be resolved from the database; leave it unchanged
                        newVals.add(oval);
                    } else {
                        newVals.add(uuid.toString());
                        changed = true;
                    }
                } catch (NumberFormatException e) {
                    newVals.add(oval);
                }
            }
            if (changed) {
                input.removeField(col.name());
                for (Object val : newVals) {
                    input.addField(col.name(), val);
                }
            }
        }
    }
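
    /*
     * Resolve a legacy integer id to a DSpace 6 UUID.
     *
     * NOTE: the source listing of this class is truncated at this point; the helper
     * below is a reconstruction sketch based on the class javadoc (the id field is
     * interpreted by the type field, scopeId by the scopeType field), not the verbatim
     * upstream implementation. In particular, it assumes that the epersonid, owner,
     * submitter, and actor fields hold EPerson legacy ids; the upstream code may also
     * resolve Group ids for some of these fields.
     */
    private UUID mapToUuid(SolrInputDocument input, FIELD col, int legacyId) throws SQLException {
        int type;
        if (col == FIELD.id || col == FIELD.scopeId) {
            Object otype = input.getFieldValue(col == FIELD.id ? "type" : "scopeType");
            if (otype == null) {
                return null;
            }
            try {
                type = Integer.parseInt(otype.toString());
            } catch (NumberFormatException e) {
                // Unexpected non-numeric type value; leave the id unchanged
                return null;
            }
        } else if (col == FIELD.owningComm) {
            type = Constants.COMMUNITY;
        } else if (col == FIELD.owningColl) {
            type = Constants.COLLECTION;
        } else if (col == FIELD.owningItem) {
            type = Constants.ITEM;
        } else {
            // epersonid, owner, submitter, actor: assumed to be EPerson legacy ids
            EPerson eperson = epersonService.findByLegacyId(context, legacyId);
            return eperson == null ? null : eperson.getID();
        }
        if (type == Constants.COMMUNITY) {
            Community comm = communityService.findByLegacyId(context, legacyId);
            return comm == null ? null : comm.getID();
        } else if (type == Constants.COLLECTION) {
            org.dspace.content.Collection coll = collectionService.findByLegacyId(context, legacyId);
            return coll == null ? null : coll.getID();
        } else if (type == Constants.ITEM) {
            Item item = itemService.findByLegacyId(context, legacyId);
            if (item == null) {
                return null;
            }
            // Retain only the most recently used item in the hibernate cache
            if (lastItem != null && !lastItem.equals(item)) {
                context.uncacheEntity(lastItem);
                numUncache++;
            }
            lastItem = item;
            return item.getID();
        } else if (type == Constants.BITSTREAM) {
            Bitstream bitstream = bitstreamService.findByLegacyId(context, legacyId);
            if (bitstream == null) {
                return null;
            }
            // Retain only the most recently used bitstream in the hibernate cache
            if (lastBitstream != null && !lastBitstream.equals(bitstream)) {
                context.uncacheEntity(lastBitstream);
                numUncache++;
            }
            lastBitstream = bitstream;
            return bitstream.getID();
        }
        return null;
    }
}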