// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// de.tudarmstadt.ukp.wikipedia.revisionmachine.api.RevisionIterator Maven / Gradle / Ivy
//
// There is a newer version: 1.1.0
// Show newest version
/*******************************************************************************
 * Copyright (c) 2011 Ubiquitous Knowledge Processing Lab
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * Project Website:
 * 	http://jwpl.googlecode.com
 *
 * Contributors:
 * 	Torsten Zesch
 * 	Simon Kulessa
 * 	Oliver Ferschke
 ******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.revisionmachine.api;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.sql.Types;
import java.util.Iterator;
import java.util.NoSuchElementException;

import de.tudarmstadt.ukp.wikipedia.api.DatabaseConfiguration;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.common.exceptions.DecodingException;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.common.util.Time;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.codec.RevisionDecoder;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.data.tasks.content.Diff;

/**
 * Part of the JWPL Revision API
 *
 * Iterates over the rows of the <code>revisions</code> table in ascending
 * primary-key order and turns each row into a {@link Revision}.
 *
 * Rows are fetched page by page (page size = the configured buffer size).
 * {@link #next()} may return <code>null</code> for a row that had to be
 * skipped (unexpected revision counter, or a revision text that could not be
 * reconstructed); callers must check for that.
 *
 * Instances hold a JDBC connection and must be released with
 * {@link #close()}.
 */
public class RevisionIterator
	implements RevisionIteratorInterface
{

	/** Reference to the configuration parameter variable */
	private final RevisionAPIConfiguration config;

	/** Reference to the database connection */
	private Connection connection;

	/** Result set of the current page, or null if no page is open */
	private ResultSet result;

	/** Statement that produced {@link #result}, or null if no page is open */
	private PreparedStatement statement;

	/**
	 * TRUE while the cursor of {@link #result} is positioned on a row that
	 * has not yet been handed out by {@link #next()}. Makes
	 * {@link #hasNext()} idempotent.
	 */
	private boolean rowPending;

	/** Binary Data Flag - TRUE if the Revision column is LONGVARBINARY */
	private boolean binaryData;

	/** Text of the previous revision */
	private String previousRevision;

	/** Primary key of the last row that was read (start key - 1 initially) */
	private int primaryKey;

	/**
	 * Primary key indicating the end of the data (treated as inclusive);
	 * Integer.MAX_VALUE means "no upper bound".
	 */
	private int endPK;

	/** ID of the current article */
	private int currentArticleID;

	/** The last known revision counter */
	private int currentRevCounter;

	/** Configuration parameter - indicates the maximum size of a query. */
	private final int MAX_NUMBER_RESULTS;

	/**
	 * Selects how {@link #next()} supplies the revision text. NOTE: despite
	 * the name, FALSE (the default) makes next() decode and reconstruct the
	 * text eagerly, while TRUE makes next() skip the decoding and attach a
	 * RevisionApi to the revision instead (lazy loading).
	 */
	private boolean shouldLoadRevisionText;

	/**
	 * The revisionapi for this iterator - used by the Revision object
	 * in case of lazy loading
	 */
	private RevisionApi revApi = null;

	/**
	 * Returns the text-loading flag.
	 *
	 * @return the flag as set by the constructor or
	 *         {@link #setShouldLoadRevisionText(boolean)}; see the field
	 *         documentation for its (inverted) effect on {@link #next()}
	 */
	public boolean shouldLoadRevisionText()
	{
		return shouldLoadRevisionText;
	}

	/**
	 * Sets the text-loading flag.
	 *
	 * @param shouldLoadRevisionText
	 *            FALSE: next() reconstructs the revision text eagerly; TRUE:
	 *            next() attaches a RevisionApi to the revision instead
	 */
	public void setShouldLoadRevisionText(boolean shouldLoadRevisionText)
	{
		this.shouldLoadRevisionText = shouldLoadRevisionText;
	}

	/**
	 * (Constructor) Creates a new RevisionIterator object that works on an
	 * already opened connection.
	 *
	 * @param config
	 *            Reference to the configuration object
	 * @param startPK
	 *            Start index
	 * @param endPK
	 *            End index
	 * @param connection
	 *            Reference to the connection
	 *
	 * @throws IllegalArgumentException
	 *             if an index is negative, startPK is greater than endPK or
	 *             the connection is null
	 * @throws WikiApiException
	 *             if an error occurs
	 */
	public RevisionIterator(final RevisionAPIConfiguration config,
			final int startPK, final int endPK, final Connection connection)
		throws WikiApiException
	{

		if (startPK < 0 || endPK < 0 || startPK > endPK || connection == null) {
			throw new IllegalArgumentException("Illegal argument");
		}

		// The query selects keys strictly greater than primaryKey.
		this.primaryKey = startPK - 1;
		this.endPK = endPK;
		this.config = config;

		this.currentArticleID = -1;
		this.currentRevCounter = -1;

		MAX_NUMBER_RESULTS = config.getBufferSize();

		this.connection = connection;
	}

	/**
	 * (Constructor) Creates a new RevisionIterator object.
	 *
	 * @param config
	 *            Reference to the configuration object
	 * @param startPK
	 *            Start index
	 *
	 * @throws IllegalArgumentException
	 *             if startPK is negative
	 * @throws WikiApiException
	 *             if an error occurs
	 */
	public RevisionIterator(final RevisionAPIConfiguration config,
			final int startPK)
		throws WikiApiException
	{

		this(config);

		if (startPK < 0) {
			// this(config) has already opened a connection - do not leak it.
			closeConnectionQuietly();
			throw new IllegalArgumentException("Illegal argument");
		}

		this.primaryKey = startPK - 1;
	}

	/**
	 * (Constructor) Creates a new RevisionIterator object.
	 *
	 * @param config
	 *            Reference to the configuration object
	 * @param startPK
	 *            Start index
	 * @param endPK
	 *            End index
	 *
	 * @throws IllegalArgumentException
	 *             if an index is negative or startPK is greater than endPK
	 * @throws WikiApiException
	 *             if an error occurs
	 */
	public RevisionIterator(final RevisionAPIConfiguration config,
			final int startPK, final int endPK)
		throws WikiApiException
	{

		this(config, startPK);

		if (endPK < 0 || startPK > endPK) {
			// The delegated constructor has already opened a connection.
			closeConnectionQuietly();
			throw new IllegalArgumentException("Illegal argument");
		}

		this.endPK = endPK;
	}

	/**
	 * (Constructor) Creates a new RevisionIterator object that iterates over
	 * the whole table and opens its own database connection.
	 *
	 * @param config
	 *            Reference to the configuration object
	 *
	 * @throws WikiApiException
	 *             if the database connection could not be established
	 */
	public RevisionIterator(final RevisionAPIConfiguration config)
		throws WikiApiException
	{

		this.config = config;
		this.MAX_NUMBER_RESULTS = config.getBufferSize();

		this.primaryKey = -1;
		this.endPK = Integer.MAX_VALUE;

		this.statement = null;
		this.result = null;
		this.previousRevision = null;

		try {
			connect();
		}
		catch (SQLException e) {
			throw new WikiApiException(e);
		}
	}

	/**
	 * (Constructor) Creates a new RevisionIterator object.
	 *
	 * @param config
	 *            Reference to the configuration object
	 * @param shouldLoadRevisionText
	 *            text-loading flag, see
	 *            {@link #setShouldLoadRevisionText(boolean)}
	 * @throws WikiApiException
	 *             if an error occurs
	 */
	public RevisionIterator(final RevisionAPIConfiguration config,
			boolean shouldLoadRevisionText)
		throws WikiApiException
	{
		this(config);
		this.shouldLoadRevisionText = shouldLoadRevisionText;
	}

	/**
	 * (Constructor) Creates a new RevisionIterator object from a general
	 * database configuration.
	 *
	 * @param db
	 *            database configuration whose host, database, user, password
	 *            and language are copied into a new RevisionAPIConfiguration
	 *
	 * @throws WikiApiException
	 *             if an error occurs
	 */
	public RevisionIterator(final DatabaseConfiguration db)
		throws WikiApiException
	{
		this(getRevisionAPIConfig(db));
	}

	/**
	 * Builds a RevisionAPIConfiguration from the connection settings of a
	 * DatabaseConfiguration. All other settings keep the defaults of
	 * RevisionAPIConfiguration.
	 *
	 * @param db
	 *            source configuration
	 * @return new revision api configuration
	 */
	private static RevisionAPIConfiguration getRevisionAPIConfig(
			final DatabaseConfiguration db)
	{
		RevisionAPIConfiguration revAPIConfig = new RevisionAPIConfiguration();

		revAPIConfig.setHost(db.getHost());
		revAPIConfig.setDatabase(db.getDatabase());
		revAPIConfig.setUser(db.getUser());
		revAPIConfig.setPassword(db.getPassword());
		revAPIConfig.setLanguage(db.getLanguage());

		return revAPIConfig;
	}

	/**
	 * Fetches the next page of rows and positions the cursor on its first
	 * row. On success the statement and result set stay open; if the page is
	 * empty they are closed again.
	 *
	 * @return TRUE, if the result set has another element FALSE, otherwise
	 *
	 * @throws SQLException
	 *             if an error occurs while accessing the database.
	 */
	private boolean query()
		throws SQLException
	{
		// Only int values are concatenated, so there is no injection risk.
		final StringBuilder sql = new StringBuilder(
				"SELECT PrimaryKey, Revision, RevisionCounter,"
						+ " RevisionID, ArticleID, Timestamp, FullRevisionID "
						+ "FROM revisions");

		String glue = " WHERE ";
		if (primaryKey > 0) {
			sql.append(glue).append("PrimaryKey > ").append(primaryKey);
			glue = " AND ";
		}
		if (endPK != Integer.MAX_VALUE) {
			// Bound the range by key instead of by row count: a LIMIT derived
			// from (endPK - primaryKey) is only right for gap-free keys and
			// used to let one row beyond endPK through.
			sql.append(glue).append("PrimaryKey <= ").append(endPK);
		}

		// Paging continues from the last key seen, so the order must be
		// deterministic.
		sql.append(" ORDER BY PrimaryKey");

		if (MAX_NUMBER_RESULTS > 0) {
			sql.append(" LIMIT ").append(MAX_NUMBER_RESULTS);
		}

		final String query = sql.toString();

		try {
			runQuery(query);
		}
		catch (Exception e) {
			// Deliberately broad: any failure of the first attempt is treated
			// as a broken connection and answered with exactly one
			// reconnect + retry. A failure of the retry is propagated.
			reportConnectionState();
			connect();
			runQuery(query);
		}

		if (result.next()) {
			binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY;
			return true;
		}

		closeQuery();
		return false;
	}

	/**
	 * Prepares and executes the given SQL on the current connection and
	 * stores statement and result set in the corresponding fields.
	 *
	 * @param sql
	 *            complete SELECT statement
	 * @throws SQLException
	 *             if the statement could not be prepared or executed
	 */
	private void runQuery(final String sql)
		throws SQLException
	{
		this.statement = this.connection.prepareStatement(sql);
		// executeQuery(String) must not be called on a PreparedStatement.
		this.result = this.statement.executeQuery();
	}

	/**
	 * Prints the state of the current connection to stderr. Used as a
	 * diagnostic before a reconnect.
	 */
	private void reportConnectionState()
	{
		try {
			System.err.println("Conncection Closed: " + connection.isClosed());
			System.err.println("Connection Valid: " + connection.isValid(5));
		}
		catch (SQLException ignored) {
			// Diagnostics only - a failing probe must not prevent the
			// reconnect that follows.
		}
	}

	/**
	 * Closes the result set and the statement of the current page (if any)
	 * and clears the references so that they are never used again.
	 *
	 * @throws SQLException
	 *             if closing one of the resources fails
	 */
	private void closeQuery()
		throws SQLException
	{
		final ResultSet oldResult = this.result;
		final PreparedStatement oldStatement = this.statement;

		this.rowPending = false;
		this.result = null;
		this.statement = null;

		try {
			if (oldResult != null) {
				oldResult.close();
			}
		}
		finally {
			if (oldStatement != null) {
				oldStatement.close();
			}
		}
	}

	/**
	 * Closes the connection and ignores any error. Used when a constructor
	 * rejects its arguments after the connection has been opened.
	 */
	private void closeConnectionQuietly()
	{
		try {
			if (this.connection != null) {
				this.connection.close();
			}
		}
		catch (SQLException ignored) {
			// The IllegalArgumentException thrown by the caller is the
			// failure that matters.
		}
	}

	/**
	 * Returns the next revision.
	 *
	 * If {@link #hasNext()} was not called beforehand, the next row is
	 * fetched implicitly.
	 *
	 * @return next revision, or <code>null</code> if the row was skipped
	 *         because its revision counter was not the expected one or its
	 *         text could not be reconstructed
	 *
	 * @throws NoSuchElementException
	 *             if there is no further row
	 * @throws RuntimeException
	 *             wrapping any database, decoding or I/O error
	 */
	@Override
	public Revision next()
	{
		if (!rowPending && !hasNext()) {
			throw new NoSuchElementException();
		}
		// The row is consumed from here on, whatever the outcome.
		rowPending = false;

		try {

			int revCount, articleID;

			revCount = result.getInt(3);
			articleID = result.getInt(5);

			// Advance the paging key before any early return. Otherwise a
			// skipped row at the end of a page would be fetched again, and a
			// skipped row at the end of the table would be fetched forever.
			this.primaryKey = result.getInt(1);

			if (articleID != this.currentArticleID) {
				this.currentRevCounter = 0;
				this.currentArticleID = articleID;
			}

			if (revCount - 1 != this.currentRevCounter) {

				System.err.println("\nInvalid RevCounter -" + " [ArticleId "
						+ articleID + ", RevisionId " + result.getInt(4)
						+ ", RevisionCounter " + result.getInt(3)
						+ "] - Expected: " + (this.currentRevCounter + 1));

				// Resynchronise on this row; the diff chain is broken, so
				// the previous text must not be reused.
				this.currentRevCounter = revCount;
				this.previousRevision = null;

				return null;
			}

			this.currentRevCounter = revCount;

			Revision revision = new Revision(revCount);
			revision.setPrimaryKey(this.primaryKey);
			if (!shouldLoadRevisionText) {
				// Eager mode (default): decode the stored diff and apply it
				// to the previous revision text.
				String currentRevision;

				Diff diff;
				RevisionDecoder decoder = new RevisionDecoder(
						config.getCharacterSet());

				if (binaryData) {
					decoder.setInput(result.getBinaryStream(2), true);
				}
				else {
					decoder.setInput(result.getString(2));
				}
				diff = decoder.decode();

				try {
					currentRevision = diff.buildRevision(previousRevision);
				}
				catch (Exception e) {
					// Best effort: report the row, skip it and continue.
					this.previousRevision = null;
					System.err.println("Reconstruction failed -"
							+ " [ArticleId " + result.getInt(5)
							+ ", RevisionId " + result.getInt(4)
							+ ", RevisionCounter " + result.getInt(3) + "]");
					return null;
				}

				previousRevision = currentRevision;
				revision.setRevisionText(currentRevision);
			}
			else {
				// Lazy mode: hand the revision a (shared) RevisionApi instead
				// of a reconstructed text.
				if (revApi == null) {
					revApi = new RevisionApi(config);
				}
				revision.setRevisionApi(revApi);
			}

			revision.setRevisionID(result.getInt(4));
			revision.setArticleID(articleID);
			revision.setTimeStamp(new Timestamp(result.getLong(6)));
			revision.setFullRevisionID(result.getInt(7));

			return revision;

		}
		catch (DecodingException e) {
			throw new RuntimeException(e);
		}
		catch (SQLException e) {
			throw new RuntimeException(e);
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
		catch (WikiApiException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Returns whether another revision is available or not. Calling this
	 * method repeatedly without calling {@link #next()} does not skip rows.
	 *
	 * @return TRUE or FALSE
	 *
	 * @throws RuntimeException
	 *             wrapping any database error
	 */
	@Override
	public boolean hasNext()
	{
		if (rowPending) {
			return true;
		}

		try {
			if (result != null && result.next()) {
				rowPending = true;
				return true;
			}

			// Current page is used up - release it before fetching more.
			closeQuery();

			// primaryKey is the last key read and endPK is inclusive, so
			// there is nothing left once primaryKey has reached endPK.
			if (primaryKey < endPK) {
				rowPending = query();
				return rowPending;
			}

			return false;

		}
		catch (SQLException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * This method is unsupported.
	 *
	 * @deprecated
	 * @throws UnsupportedOperationException
	 */
	@Override
	@Deprecated
	public void remove()
	{
		throw new UnsupportedOperationException();
	}

	/**
	 * This method closes the connection to the input component, including
	 * the statement and result set of the current page.
	 *
	 * @throws SQLException
	 *             if an error occurs while closing the connection to the
	 *             database.
	 */
	@Override
	public void close()
		throws SQLException
	{
		try {
			closeQuery();
		}
		finally {
			if (this.connection != null) {
				this.connection.close();
			}
		}
	}

	/**
	 * Demo / smoke test: iterates over all revisions of a local database and
	 * prints every 10000th revision together with the elapsed time. The
	 * connection settings are hard-coded for a local test setup.
	 *
	 * @param args
	 *            ignored
	 * @throws Exception
	 *             if an error occurs
	 */
	public static void main(final String[] args)
		throws Exception
	{

		RevisionAPIConfiguration config = new RevisionAPIConfiguration();
		config.setHost("localhost");
		config.setDatabase("en_wiki");
		config.setUser("root");
		config.setPassword("1234");

		config.setCharacterSet("UTF-8");
		config.setBufferSize(20000);
		config.setMaxAllowedPacket(16 * 1024 * 1023);

		long count = 1;
		long start = System.currentTimeMillis();

		Revision rev;
		// Declared with the concrete type: a raw Iterator would return
		// Object from next().
		RevisionIterator it = new RevisionIterator(config);

		try {
			System.out.println(Time.toClock(System.currentTimeMillis() - start));

			while (it.hasNext()) {
				rev = it.next();

				if (count++ % 10000 == 0) {

					// next() returns null for skipped rows.
					if (rev != null) {
						System.out.println(rev.toString());
					}
				}
			}

			System.out.println(Time.toClock(System.currentTimeMillis() - start));
		}
		finally {
			it.close();
		}
	}

	/**
	 * Opens a new MySQL connection using host, database, user and password
	 * of the configuration. An existing connection is closed first; the
	 * result set and statement that belonged to it are discarded, so the
	 * next call of {@link #hasNext()} fetches a fresh page after the last
	 * row that was returned.
	 *
	 * @throws SQLException
	 *             if the connection could not be established
	 */
	public void connect()
		throws SQLException
	{
		if (this.connection != null) {
			// Closing the connection invalidates the open cursor.
			this.rowPending = false;
			this.result = null;
			this.statement = null;

			try {
				this.connection.close();
			}
			catch (SQLException ignored) {
				// Reconnecting is usually done because the old connection
				// is broken; failing to close it must not prevent that.
			}
			System.out.println("Reconnect to Database");
		}
		else {
			System.out.println("Connect to Database");
		}

		try {
			String driverDB = "com.mysql.jdbc.Driver";
			Class.forName(driverDB);
		}
		catch (ClassNotFoundException e) {
			// Not fatal here: getConnection() below fails with an
			// SQLException if no suitable driver is registered.
			System.err.println("JDBC Driver is missing");
		}

		this.connection = DriverManager.getConnection(
						"jdbc:mysql://" + this.config.getHost()
								+ "/" + this.config.getDatabase(),
						this.config.getUser(), this.config.getPassword());
	}
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy