All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nz.co.gregs.dbvolution.DBExtractor Maven / Gradle / Ivy

/*
 * Copyright 2015 gregory.graham.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nz.co.gregs.dbvolution;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import nz.co.gregs.dbvolution.databases.DBDatabase;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.List;
import nz.co.gregs.dbvolution.actions.DBActionList;
import nz.co.gregs.dbvolution.exceptions.AccidentalBlankQueryException;
import nz.co.gregs.dbvolution.exceptions.AccidentalCartesianJoinException;

/**
 * Extractor to retrieve data from unreliable or resource constrained databases,
 * or from exceptionally large queries.
 *
 * 

* To use Extractor, create a subclass implementing the * {@link #getQuery(nz.co.gregs.dbvolution.databases.DBDatabase, int, int)} and * {@link #processRows(java.util.List)} methods and call {@link #extract() }. *

* The extractor uses primary key ranges to reduce the size of the query to * something the database can handle. The range is increased or decreased * automatically depending on whether database coped with the request well or * not. * *

* All values will be returned, unless the database failed to process a range of * 1 (that is a single row) which case the value is skipped and the Extractor * will continue. * *

* A key feature of using Extractor over other methods is its ability to * accelerate and brake as possible or required to achieve close to optimal * throughput. * *

Support DBvolution at * Patreon

* * @author Gregory Graham */ public abstract class DBExtractor extends DBScript { /* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ private int maxBoundIncrease = 10000000; private static final int MIN_BOUND_INCREASE = 1; private int boundIncrease = 10; private int maxBound = 200 * 1000000; private int startLowerBound = 0; private int lowerBound = 0; private boolean moreRecords = true; private double previousTimePerRecord = Double.MAX_VALUE; // ridiculous default is only to seed the process. private Integer timeoutInMilliseconds = 10000; private Long rowCount = null; private boolean countOnly = false; private final DBDatabase database; /** * Default constructor. * * @param db */ public DBExtractor(DBDatabase db) { database = db; } /** * When Extractor has successfully extract some rows, they are handed to this * method for processing. * * @param rows * @throws Exception */ abstract public void processRows(List rows) throws Exception; /** * Using the database and bounds provided, construct the required query. * *

* Extractor does not know the query you want executed so this is the place to * add it. * *

* Choose one important table in your query and add the lower- and * upper-bounds provided to the primary key as a permitted range:
* {@code employee.employeeID.permittedRange(lowerbound, upperbound);} * *

* Add the table to your query and return it to the extractor process:
* {@code return db.getDBQuery(employee);} * *

* The rows found by the Extractor will be sent to * {@link #processRows(java.util.List)} . * * @param db * @param lowerbound * @param upperbound *

Support DBvolution at * Patreon

* @return a DBQuery for the database and range */ abstract public DBQuery getQuery(DBDatabase db, int lowerbound, int upperbound); private DBDatabase getDatabase() { return database; } /** * Starts the extraction process. * *

* Call this method in your Extractor subclass to start extracting rows from * the database and processing them. * *

* Works in conjuction with the * {@link #getQuery(nz.co.gregs.dbvolution.databases.DBDatabase, int, int)} * and {@link #processRows(java.util.List)} method to provide a dynamic * extraction process that achieves fast results on unreliable or * under-resourced databases. * *

Support DBvolution at * Patreon

* * @return @throws Exception */ public final DBActionList extract() throws Exception { DBActionList actions = new DBActionList(); DBDatabase db = getDatabase(); startLowerBound = lowerBound; Date startTime = new Date(); GregorianCalendar cal = new GregorianCalendar(); while (hasMoreRecords()) { actions.addAll(db.test(this)); Date finishTime = new Date(); final double elapsedTimeInMilliseconds = 0.0 + finishTime.getTime() - startTime.getTime(); double timePerRecord = elapsedTimeInMilliseconds / (lowerBound - startLowerBound); System.out.println("EXTRACTED: " + getLowerBound() + "-" + getUpperBound() + " (+" + getBoundIncrease() + ") in " + elapsedTimeInMilliseconds + "ms at " + timePerRecord + "ms/record."); double estimatedRequiredTime = timePerRecord * (maxBound - startLowerBound); cal.setTime(startTime); int secondsValue = (new Double(estimatedRequiredTime / 1000.0D)).intValue(); cal.add(Calendar.SECOND, secondsValue); double timeInHours = (Math.round((estimatedRequiredTime / (1000 * 60 * 60)) * 100) + 0.0) / 100.0; double timeInMinutes = (Math.round((estimatedRequiredTime / (1000 * 60)) * 100) + 0.0) / 100.0; double elapsedTimeInMinutes = (Math.round((elapsedTimeInMilliseconds / (1000 * 60)) * 100) + 0.0) / 100.0; double elapsedTimeInHours = (Math.round((elapsedTimeInMilliseconds / (1000 * 60 * 60)) * 100) + 0.0) / 100.0; double remainingTimeInMinutes = timeInMinutes - elapsedTimeInMinutes; double remainingTimeInHours = timeInHours - elapsedTimeInHours; if (timeInHours > 1) { System.out.println("PROJECTED: time=" + timeInHours + "hours: " + (cal.getTime())); System.out.println("ELAPSED: time=" + elapsedTimeInHours + "hours"); System.out.println("REMAINING: time=" + remainingTimeInHours + "hours"); } else { System.out.println("PROJECTED: time=" + timeInMinutes + "minutes: " + (cal.getTime())); System.out.println("ELAPSED: time=" + elapsedTimeInMinutes + "minutes"); System.out.println("REMAINING: time=" + remainingTimeInMinutes + "minutes"); } } return actions; } /** * Used to maintain the process in isolation from all other processes and * ensure that the processing does not alter any rows. * *

* This method cannot be changed. * *

Support DBvolution at * Patreon

* * @return an action list * @throws java.io.FileNotFoundException * @throws java.sql.SQLException */ @Override public final DBActionList script(DBDatabase db) throws FileNotFoundException, IOException, SQLException, Exception { DBActionList actions = new DBActionList(); List rows = getRows(db); Date startTime = new Date(); processRows(rows); Date finishTime = new Date(); double timePerRecord = (0.0 + finishTime.getTime() - startTime.getTime()) / getBoundIncrease(); System.out.println("PROCESSED: " + (getLowerBound()) + "-" + (getUpperBound() - getBoundIncrease()) + " (+" + getBoundIncrease() + ") at " + timePerRecord + "ms/record."); return actions; } @SuppressFBWarnings( value = "REC_CATCH_EXCEPTION", justification = "Database vendors throw many interesting exceptions") private List getRows(DBDatabase db) throws AccidentalCartesianJoinException, AccidentalBlankQueryException { List rows = null; this.rowCount = 0L; double timePerRecord = 10000.0; while (hasMoreRecords() && rows == null) { try { if (getLowerBound() > getMaxBound()) { setMoreRecords(false); } else { System.out.println("RETRIEVING: " + getLowerBound() + "-" + getUpperBound() + " (+" + getBoundIncrease() + ")"); DBQuery dbQuery = getQuery(db, getLowerBound(), getUpperBound()); setQueryTimeout(dbQuery); Date startTime = new Date(); if (this.countOnly) { rowCount = dbQuery.count(); rows = new ArrayList<>(); } else { rows = dbQuery.getAllRows(); rowCount = 0L + rows.size(); } Date finishTime = new Date(); final double timeTaken = 0.0 + finishTime.getTime() - startTime.getTime(); timePerRecord = timeTaken / getBoundIncrease(); System.out.println("RETRIEVED: " + getLowerBound() + "-" + getUpperBound() + " (+" + getBoundIncrease() + ") after " + timeTaken + " at " + timePerRecord + "ms/record."); } } catch (SQLException | AccidentalBlankQueryException | AccidentalCartesianJoinException ex) { if (getBoundIncrease() == getMinBoundIncrease()) { // We can't get this row so acknowledge the error System.out.println("Unable to access records: " + getLowerBound() + " - " + getUpperBound()); // and move on. stepForward(); System.out.println("Will retry from " + getLowerBound() + " - " + getUpperBound() + " (+" + getBoundIncrease() + ")."); } else { System.out.println("Stepping back from " + getLowerBound() + "-" + getUpperBound() + " and braking from +" + getBoundIncrease() + "."); brake(); System.out.println("Will retry from " + getLowerBound() + "-" + getUpperBound() + " (+" + getBoundIncrease() + ")."); } } } stepForward(timePerRecord); return rows; } private void stepForward(double timePerRecord) { setLowerBound(getLowerBound() + getBoundIncrease()); accelerateIfImproved(timePerRecord); } private void stepForward() { setLowerBound(getLowerBound() + getBoundIncrease()); } private void brake() { setBoundIncrease(getBoundIncrease() / 2); if (getBoundIncrease() < getMinBoundIncrease()) { setBoundIncrease(getMinBoundIncrease()); } } private void accelerate() { setBoundIncrease(getBoundIncrease() * 2); if (getBoundIncrease() > getMaxBoundIncrease()) { setBoundIncrease(getMaxBoundIncrease()); } } /** *

Support DBvolution at * Patreon

* * @return the noMoreRecords */ private boolean hasMoreRecords() { return moreRecords; } /** * @param noMoreRecords the noMoreRecords to set */ private void setMoreRecords(boolean noMoreRecords) { this.moreRecords = noMoreRecords; } private void accelerateIfImproved(double timePerRecord) { if (timePerRecord < previousTimePerRecord) { accelerate(); } else { brake(); } previousTimePerRecord = timePerRecord; } private int getUpperBound() { return getLowerBound() + getBoundIncrease(); } /** *

Support DBvolution at * Patreon

* * @return the maxBoundIncrease */ protected int getMaxBoundIncrease() { return maxBoundIncrease; } /** * Allows the programmer to specify a maximum difference between the lower- * and upper-bounds. * * @param maxBoundIncrease the maxBoundIncrease to set */ protected void setMaxBoundIncrease(int maxBoundIncrease) { this.maxBoundIncrease = maxBoundIncrease; } /** *

Support DBvolution at * Patreon

* * @return the MIN_BOUND_INCREASE */ private int getMinBoundIncrease() { return MIN_BOUND_INCREASE; } /** *

Support DBvolution at * Patreon

* * @return the boundIncrease */ protected int getBoundIncrease() { return boundIncrease; } /** * @param boundIncrease the boundIncrease to set */ private void setBoundIncrease(int boundIncrease) { this.boundIncrease = boundIncrease; } /** *

Support DBvolution at * Patreon

* * @return the maxBound */ protected int getMaxBound() { return maxBound; } /** * Allows the programmer to set the last number to be extracted. * * @param maxBound the maxBound to set */ protected void setMaxBound(int maxBound) { this.maxBound = maxBound; } /** *

Support DBvolution at * Patreon

* * @return the lowerBound */ protected int getLowerBound() { return lowerBound; } /** * @param lowerBound the lowerBound to set */ protected void setLowerBound(int lowerBound) { this.lowerBound = lowerBound; } /** * Changes the default timeout for the underlying query during this * extraction. * *

* DBvolution default timeout is 10000 milliseconds to avoid excessive * queries. * *

* Use this method to extend or reduce the timeout period as required. * * @param milliseconds */ protected void setTimeoutInMilliseconds(Integer milliseconds) { this.timeoutInMilliseconds = milliseconds; } private void setQueryTimeout(DBQuery query) { if (this.timeoutInMilliseconds == null) { query.clearTimeout(); } else { query.setTimeoutInMilliseconds(this.timeoutInMilliseconds); } } /** * Restrict this extraction to only returning the row count. * *

* The default is to return the rows found, however it is much more efficient * to only count the rows. * *

* Use the method to switch to only counting the rows. Handling the results of * the extraction is still done in {@link #processRows(java.util.List) } but * the list will be empty. *

* Use {@link #getRowCount() } to retrieve the row count within {@link #processRows(java.util.List) * }. */ public void setToCountOnly() { countOnly = true; } /** * Set this extraction to retrieve rows. * *

* This is the default behavior. * *

* Use this method to switch from {@link #setToCountOnly() } back to * retrieving the full collection of data. * *

* Processing of rows extracted is done in {@link #processRows(java.util.List) * }. * * */ public void setToRetrieveRows() { countOnly = false; } /** * Return the number of rows found by this iteration of the extraction. * *

* For use with {@link #setToCountOnly() } but also with {@link #setToRetrieveRows() * } * *

Support DBvolution at * Patreon

* * @return the number of rows found in the last partial extraction */ public Long getRowCount() { return rowCount; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy