
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.util;
import java.io.IOException;
import java.sql.SQLException;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.UUID;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.dspace.content.Bitstream;
import org.dspace.content.Community;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.CollectionService;
import org.dspace.content.service.CommunityService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.eperson.EPerson;
import org.dspace.eperson.Group;
import org.dspace.eperson.factory.EPersonServiceFactory;
import org.dspace.eperson.service.EPersonService;
import org.dspace.eperson.service.GroupService;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
/**
 * CLI tool to upgrade legacy id references in SOLR statistics to DSpace 6 UUIDs.
*
* This command will need to be run iteratively over each statistics shard until all legacy id values have
* been replaced.
*
* If a legacy id cannot be resolved from the database, the id will remain unchanged.
* "field:* AND NOT(field:*-*)" can be used to locate legacy ids
*
* See DS-3602 for the origin of this issue. This code is targeted for inclusion in the DSpace 6.1 release.
*
* Recommendation: for a large repository, run this command with -Xmx2000m if possible.
*
 * In testing, processing 1,000,000 statistics records took roughly 60 minutes.
*
* @author Terry Brady, Georgetown University Library
*/
public class SolrUpgradePre6xStatistics {
//Command line parameter constants
private static final String INDEX_NAME_OPTION = "i";
private static final String NUMREC_OPTION = "n";
private static final String BATCH_OPTION = "b";
private static final String TYPE_OPTION = "t";
private static final String HELP_OPTION = "h";
private static final int NUMREC_DEFAULT = 100000;
private static final int BATCH_DEFAULT = 10000;
//After processing each batch of updates to SOLR, evaluate if the hibernate cache needs to be cleared
private static final int CACHE_LIMIT = 20000;
private static final String INDEX_DEFAULT = "statistics";
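    // Pre-6x legacy ids are plain integers, while DSpace 6 UUIDs contain hyphens, so
    // "field:* AND -(field:*-*)" matches only records that still carry a legacy id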
private static final String MIGQUERY =
"(id:* AND -(id:*-*)) OR (scopeId:* AND -(scopeId:*-*)) OR (epersonid:* AND -(epersonid:*-*))";
//Counters to determine the number of items to process
private int numRec = NUMREC_DEFAULT;
private int batchSize = BATCH_DEFAULT;
//Cache management
private int numProcessed = 0;
private long totalCache = 0;
private long numUncache = 0;
    private final List<SolrInputDocument> docs = new ArrayList<>();
private Context context;
//Enum to identify the named SOLR statistics fields to update
private enum FIELD {
id,
scopeId,
owningComm,
owningColl,
owningItem,
epersonid,
owner,
submitter,
actor;
}
//Logger
private static final Logger log = LogManager.getLogger();
//DSpace Services
private ConfigurationService configurationService = DSpaceServicesFactory.getInstance().getConfigurationService();
protected CommunityService communityService = ContentServiceFactory.getInstance().getCommunityService();
protected CollectionService collectionService = ContentServiceFactory.getInstance().getCollectionService();
protected ItemService itemService = ContentServiceFactory.getInstance().getItemService();
protected BitstreamService bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
protected EPersonService epersonService = EPersonServiceFactory.getInstance().getEPersonService();
protected GroupService groupService = EPersonServiceFactory.getInstance().getGroupService();
    // This code operates on one shard at a time, so the SOLR web service is accessed directly
    // rather than through the DSpace Solr logger, which only writes to the current shard
private final HttpSolrClient server;
//Allows for smart use of hibernate cache
private Item lastItem = null;
private Bitstream lastBitstream = null;
//Report on process times
private long startTime = -1;
private long lastTime = -1;
/**
* Construct the utility class from the command line options
* @param indexName name of the statistics shard to update
* @param numRec maximum number of records to process
* @param batchSize batch this many documents before updating.
* @throws IOException
* @throws SolrServerException
*/
public SolrUpgradePre6xStatistics(String indexName, int numRec, int batchSize)
throws SolrServerException, IOException {
String serverPath = configurationService.getProperty("solr-statistics.server");
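        // Derive the shard URL by replacing the trailing core name in the configured
        // statistics URL (e.g. .../solr/statistics -> .../solr/statistics-2017 for a yearly shard)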
serverPath = serverPath.replaceAll("statistics$", indexName);
System.out.println("Connecting to " + serverPath);
server = new HttpSolrClient.Builder(serverPath)
.build();
this.numRec = numRec;
this.batchSize = batchSize;
refreshContext();
}
/*
* Process a batch of updates to SOLR
*/
private void batchUpdateStats() throws SolrServerException, IOException {
if (docs.size() > 0) {
server.add(docs);
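            // waitFlush=true, waitSearcher=true: block until the batch is durable and visible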
server.commit(true, true);
docs.clear();
}
}
/**
* Refresh the DSpace Context object in order to periodically release objects from memory
* @throws IOException
* @throws SolrServerException
*/
private void refreshContext() throws SolrServerException, IOException {
if (context != null) {
try {
totalCache += numUncache + context.getCacheSize();
} catch (SQLException e) {
log.warn(e.getMessage());
}
}
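        // Start a fresh read-only context; abandoning the old one lets its hibernate cache be reclaimed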
this.context = new Context(Context.Mode.READ_ONLY);
lastItem = null;
lastBitstream = null;
numUncache = 0;
}
/*
* Compute the number of items that were cached by hibernate since the context was cleared.
*/
private long getCacheCounts(boolean fromStart) {
long count = 0;
try {
count = context.getCacheSize();
} catch (SQLException e) {
//no action
}
count += this.numUncache;
if (fromStart) {
count += totalCache;
}
return count;
}
    /**
     * Compute the elapsed processing time
     *
     * @param fromStart
     *            if true, report on processing time since the start of the program
     * @return elapsed time in ms, measured from the start time or from the previous
     *         call depending on fromStart
     */
private long logTime(boolean fromStart) {
long ret = 0;
long cur = Instant.now().toEpochMilli();
if (lastTime == -1) {
startTime = cur;
} else if (fromStart) {
ret = cur - startTime;
} else {
ret = cur - lastTime;
}
lastTime = cur;
return ret;
}
/*
* Format ms count as h:mm:ss
*
* @param dur Duration in ms
*
* @return duration formatted as h:mm:ss
*/
private String duration(long dur) {
long sec = dur / 1000;
long hh = sec / 3600;
long mm = (sec % 3600) / 60;
long ss = (sec % 60);
return String.format("%d:%02d:%02d", hh, mm, ss);
}
/**
* Print a status message appended with the processing time for the operation
*
* @param numProcessed
* count of records processed so far.
* @param fromStart
* if true, report on processing time since the start of the program
*/
private void printTime(int numProcessed, boolean fromStart) {
long dur = logTime(fromStart);
long totalDur = logTime(true);
String stotalDur = duration(totalDur);
long cacheSize = 0;
try {
cacheSize = context.getCacheSize();
} catch (SQLException e) {
log.error("Cannot get cache size", e);
}
String label = fromStart ? "TOTAL" : "Processed";
System.out.println(String.format("%s (%s; %s; %s)",
String.format("\t%,12d %10s...", numProcessed, label),
String.format("%,6d sec; %s", dur / 1000, stotalDur),
String.format("DB cache: %,6d/%,8d", cacheSize, getCacheCounts(fromStart)),
String.format("Docs: %,6d", docs.size())));
}
/*
* Create command line option processor
*/
private static Options makeOptions() {
Options options = new Options();
options.addOption(HELP_OPTION, "help", false, "Get help on options for this command.");
        options.addOption(INDEX_NAME_OPTION, "index-name", true,
                "The name of the statistics index (shard) to process (default=statistics)");
options.addOption(NUMREC_OPTION, "num-rec", true, "Total number of records to update (default=100,000).");
options.addOption(BATCH_OPTION, "batch-size", true,
"Number of records to batch update to SOLR at one time (default=10,000).");
return options;
}
/**
* A utility method to print out all available command-line options and exit
* given the specified code.
*
* @param options
* the supported options.
* @param exitCode
* the exit code to use. The method will call System#exit(int) with
* the given code.
*/
private static void printHelpAndExit(Options options, int exitCode) {
HelpFormatter myhelp = new HelpFormatter();
myhelp.printHelp(SolrUpgradePre6xStatistics.class.getSimpleName() + "\n", options);
System.out.println("\n\nCommand Defaults");
System.out.println(
"\tsolr-upgrade-statistics-6x [-i statistics] [-n num_recs_to_process] [-b num_rec_to_update_at_once]");
System.out.println("");
System.out.println(
"\tAfter upgrading to DSpace 6, this process should be run iteratively over every statistics shard ");
System.out.println("\t\tuntil there are no remaining records with legacy ids present.");
System.out.println("\t\tThis process can be run while the system is in use.");
System.out.println("");
System.out.println("\tIt will take 20-30 min to process 1,000,000 legacy records. ");
System.out.println("");
System.out.println("\tUse the -n option to manage the workload on your server. ");
System.out.println("\t\tTo process all records, set -n to 10000000 or to 100000000 (10M or 100M)");
System.out.println("\tIf possible, please allocate 2GB of memory to this process (e.g. -Xmx2000m)");
System.out.println("");
System.out.println("\tThis process will rewrite most solr statistics records and may temporarily double ");
System.out.println(
"\t\tthe size of your statistics repositories. Consider optimizing your solr repos when complete.");
System.exit(exitCode);
}
/**
* Entry point for command-line invocation
*
* @param args
* command-line arguments; see help for description
* @throws ParseException
* if the command-line arguments cannot be parsed
*/
public static void main(String[] args) throws ParseException {
CommandLineParser parser = new DefaultParser();
Options options = makeOptions();
System.out.println(" * This process should be run iteratively over every statistics shard ");
System.out.println(" * until there are no remaining records with legacy ids present.");
System.out.println(" * This process can be run while the system is in use.");
System.out.println(" * It is likely to take 1 hour/1,000,000 legacy records to be updated.");
System.out.println(" *");
System.out.println(" * This process will rewrite most solr statistics records and may temporarily double ");
System.out.println(
" *\tthe size of your statistics repositories. Consider optimizing your solr repos when complete.");
System.out.println(" * -------------------------------------------------------------------");
String indexName = INDEX_DEFAULT;
int numrec = NUMREC_DEFAULT;
int batchSize = BATCH_DEFAULT;
try {
CommandLine line = parser.parse(options, args);
if (line.hasOption(HELP_OPTION)) {
printHelpAndExit(options, 0);
}
if (line.hasOption(INDEX_NAME_OPTION)) {
indexName = line.getOptionValue(INDEX_NAME_OPTION, INDEX_DEFAULT);
} else {
System.err.println("No index name provided, defaulting to : " + INDEX_DEFAULT);
}
if (line.hasOption(NUMREC_OPTION)) {
numrec = Integer.parseInt(line.getOptionValue(NUMREC_OPTION, "" + NUMREC_DEFAULT));
}
if (line.hasOption(BATCH_OPTION)) {
batchSize = Integer.parseInt(line.getOptionValue(BATCH_OPTION, "" + BATCH_DEFAULT));
}
} catch (ParseException e) {
System.err.println("Cannot read command options");
printHelpAndExit(options, 1);
}
try {
SolrUpgradePre6xStatistics upgradeStats = new SolrUpgradePre6xStatistics(indexName, numrec, batchSize);
upgradeStats.run();
} catch (SolrServerException | SQLException | IOException e) {
log.error("Error querying stats", e);
}
}
/*
* Report on the existence of legacy id records within a shard
*/
private void runReport() throws SolrServerException, IOException {
System.out.println();
System.out.println("=================================================================");
System.out.println("\t*** Statistics Records with Legacy Id ***\n");
long total = runReportQuery();
System.out.println("\t--------------------------------------");
System.out.println(String.format("\t%,12d\t%s", total, "TOTAL"));
System.out.println("=================================================================");
System.out.println();
}
/*
* Report on the existence of specific legacy id records within a shard
*/
private long runReportQuery() throws SolrServerException, IOException {
SolrQuery sQ = new SolrQuery();
sQ.setQuery(MIGQUERY);
sQ.setFacet(true);
sQ.addFacetField("type");
sQ.addFacetField("scopeType");
QueryResponse sr = server.query(sQ);
long total = 0;
long unexpected = 0;
for (FacetField ff : sr.getFacetFields()) {
String s = ff.getName().equals("type") ? "View" : "Search";
for (FacetField.Count count : ff.getValues()) {
                String name = count.getName();
                int id;
                try {
                    id = Integer.parseInt(name);
                } catch (NumberFormatException e) {
                    // Non-numeric type values have been observed (see note below); group them
                    // with the other unexpected values rather than aborting the report
                    unexpected += count.getCount();
                    continue;
                }
if (id == Constants.COMMUNITY) {
name = "Community " + s;
} else if (id == Constants.COLLECTION) {
name = "Collection " + s;
} else if (id == Constants.ITEM) {
name = "Item " + s;
} else if (id == Constants.BITSTREAM) {
name = "Bitstream " + s;
} else {
/*
* In testing, I discovered some unexpected values in the scopeType field. It
* looks like they may have been a result of a CSV import/export error. This
* will group any unexpected values into one report line.
*/
unexpected += count.getCount();
continue;
}
System.out.println(String.format("\t%,12d\t%s", count.getCount(), name));
total += count.getCount();
}
}
if (unexpected > 0) {
System.out.println(String.format("\t%,12d\t%s", unexpected, "Unexpected Type & Full Site"));
total += unexpected;
}
long rem = sr.getResults().getNumFound() - total;
if (rem > 0) {
System.out.println(String.format("\t%,12d\t%s", rem, "Other Records"));
total += rem;
}
return total;
}
    /*
     * Process records with a legacy id. From the command line, the user may specify
     * records of a specific type to update. Otherwise, the following sequence will
     * be applied in order to optimize hibernate caching.
     *
     * Communities and Collections - retain in the cache since each is likely to be re-used
     * Items - retain in the cache until a new item is processed
     * Bitstreams - retain in the cache until a new bitstream is processed
     */
private void run() throws SolrServerException, SQLException, IOException {
runReport();
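        // Prime the timers; the first call to logTime records the start time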
logTime(false);
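        // Process batches until no legacy records remain or the requested maximum is reached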
for (int processed = updateRecords(MIGQUERY); (processed != 0)
&& (numProcessed <= numRec); processed = updateRecords(MIGQUERY)) {
printTime(numProcessed, false);
batchUpdateStats();
if (context.getCacheSize() > CACHE_LIMIT) {
refreshContext();
}
}
printTime(numProcessed, true);
if (numProcessed > 0) {
runReport();
}
}
    /*
     * Update a batch of statistics records that contain legacy ids
     *
     * @param query Query identifying the statistics records to update
     *
     * @return number of records processed. 0 indicates that no more work is available
     * (or the max processed has been reached).
     */
private int updateRecords(String query) throws SolrServerException, SQLException, IOException {
int initNumProcessed = numProcessed;
SolrQuery sQ = new SolrQuery();
sQ.setQuery(query);
sQ.setRows(batchSize);
// Ensure that items are grouped by id
// Sort by id fails due to presence of id and string fields. The ord function
// seems to help
sQ.addSort("type", SolrQuery.ORDER.desc);
sQ.addSort("scopeType", SolrQuery.ORDER.desc);
sQ.addSort("ord(owningItem)", SolrQuery.ORDER.desc);
sQ.addSort("ord(id)", SolrQuery.ORDER.asc);
sQ.addSort("ord(scopeId)", SolrQuery.ORDER.asc);
QueryResponse sr = server.query(sQ);
SolrDocumentList sdl = sr.getResults();
for (int i = 0; i < sdl.size() && (numProcessed < numRec); i++) {
SolrDocument sd = sdl.get(i);
SolrInputDocument input = new SolrInputDocument(); //ClientUtils.toSolrInputDocument(sd);
for (String name : sd.getFieldNames()) { // https://stackoverflow.com/a/38536843/2916377
input.addField(name, sd.getFieldValue(name));
}
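            // Drop Solr's internal version stamp so the document is re-added without
            // optimistic-concurrency version checks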
input.remove("_version_");
for (FIELD col : FIELD.values()) {
mapField(input, col);
}
docs.add(input);
++numProcessed;
}
return numProcessed - initNumProcessed;
}
/*
* Map solr fields from legacy ids to UUIDs.
*
* The id field is interpreted by the type field. The scopeId field is
     * interpreted by the scopeType field.
     *
     * Legacy ids will be unchanged if they cannot be mapped.
*
* @param input The SOLR statistics document to be updated
*
* @param col The SOLR field to update (if present)
*/
private void mapField(SolrInputDocument input, FIELD col) throws SQLException {
SolrInputField ifield = input.get(col.name());
if (ifield != null) {
            Collection<Object> vals = ifield.getValues();
            List<Object> newVals = new ArrayList<>();
            boolean changed = false;
            for (Object oval : vals) {
                try {
                    // Legacy ids are plain integers; UUID values will not parse and are retained
                    int legacyId = Integer.parseInt(oval.toString());
                    UUID uuid = mapToUuid(input, col, legacyId); // see reconstruction note below
                    if (uuid == null) {
                        // Id could not be resolved from the database; leave it unchanged
                        newVals.add(oval);
                    } else {
                        newVals.add(uuid.toString());
                        changed = true;
                    }
                } catch (NumberFormatException e) {
                    newVals.add(oval);
                }
            }
            if (changed) {
                input.removeField(col.name());
                for (Object val : newVals) {
                    input.addField(col.name(), val);
                }
            }
        }
    }
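
    /*
     * Resolve a legacy integer id to a DSpace 6 UUID.
     *
     * NOTE: the source listing of this class is truncated at this point; the helper
     * below is a reconstruction sketch based on the class javadoc (the id field is
     * interpreted by the type field, scopeId by the scopeType field), not the verbatim
     * upstream implementation. In particular, it assumes that the epersonid, owner,
     * submitter, and actor fields hold EPerson legacy ids; the upstream code may also
     * resolve Group ids for some of these fields.
     */
    private UUID mapToUuid(SolrInputDocument input, FIELD col, int legacyId) throws SQLException {
        int type;
        if (col == FIELD.id || col == FIELD.scopeId) {
            Object otype = input.getFieldValue(col == FIELD.id ? "type" : "scopeType");
            if (otype == null) {
                return null;
            }
            try {
                type = Integer.parseInt(otype.toString());
            } catch (NumberFormatException e) {
                // Unexpected non-numeric type value; leave the id unchanged
                return null;
            }
        } else if (col == FIELD.owningComm) {
            type = Constants.COMMUNITY;
        } else if (col == FIELD.owningColl) {
            type = Constants.COLLECTION;
        } else if (col == FIELD.owningItem) {
            type = Constants.ITEM;
        } else {
            // epersonid, owner, submitter, actor: assumed to be EPerson legacy ids
            EPerson eperson = epersonService.findByLegacyId(context, legacyId);
            return eperson == null ? null : eperson.getID();
        }
        if (type == Constants.COMMUNITY) {
            Community comm = communityService.findByLegacyId(context, legacyId);
            return comm == null ? null : comm.getID();
        } else if (type == Constants.COLLECTION) {
            org.dspace.content.Collection coll = collectionService.findByLegacyId(context, legacyId);
            return coll == null ? null : coll.getID();
        } else if (type == Constants.ITEM) {
            Item item = itemService.findByLegacyId(context, legacyId);
            if (item == null) {
                return null;
            }
            // Retain only the most recently used item in the hibernate cache
            if (lastItem != null && !lastItem.equals(item)) {
                context.uncacheEntity(lastItem);
                numUncache++;
            }
            lastItem = item;
            return item.getID();
        } else if (type == Constants.BITSTREAM) {
            Bitstream bitstream = bitstreamService.findByLegacyId(context, legacyId);
            if (bitstream == null) {
                return null;
            }
            // Retain only the most recently used bitstream in the hibernate cache
            if (lastBitstream != null && !lastBitstream.equals(bitstream)) {
                context.uncacheEntity(lastBitstream);
                numUncache++;
            }
            lastBitstream = bitstream;
            return bitstream.getID();
        }
        return null;
    }
}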