All Downloads are FREE. Search and download functionalities are using the official Maven repository.

marytts.tools.dbselection.DBHandler Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see .
 *
 */
package marytts.tools.dbselection;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;

import marytts.util.Pair;
import marytts.util.io.FileUtils;

import org.mediawiki.importer.DumpWriter;
import org.mediawiki.importer.SqlServerStream;
import org.mediawiki.importer.SqlStream;
import org.mediawiki.importer.SqlWriter;
import org.mediawiki.importer.SqlWriter15;
import org.mediawiki.importer.XmlDumpReader;

/**
 * Various functions for handling connection, inserting and querying a mysql database.
 * 
 * @author Marcela Charfuelan, Holmer Hemsen.
 */
public class DBHandler {

	private String locale = "en_US";
	private Connection cn = null;

	private Statement st = null;
	private ResultSet rs = null;
	private String currentTable = null;
	private PreparedStatement psSentence = null;
	private PreparedStatement psSelectedSentence = null;
	private PreparedStatement psWord = null;
	private PreparedStatement psCleanText = null;
	private PreparedStatement psTablesDescription = null;

	private String cleanTextTableName = "_cleanText";
	private String wordListTableName = "_wordList";
	private String dbselectionTableName = "_dbselection";
	private String selectedSentencesTableName = "_selectedSentences";

	/**
	 * The constructor loads the database driver.
	 * 
	 * @param localeVal
	 *            database language.
	 */
	public DBHandler(String localeVal) {
		initDB_Driver();
		locale = localeVal;
		cleanTextTableName = locale + cleanTextTableName;
		wordListTableName = locale + wordListTableName;
		dbselectionTableName = locale + dbselectionTableName;
		selectedSentencesTableName = locale + selectedSentencesTableName;
		System.out.println("\nMysql driver loaded, set locale=" + locale);
	}

	/** Loading DB Driver */
	private void initDB_Driver() {
		try {
			String driver = "com.mysql.jdbc.Driver";
			Class.forName(driver);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/***
	 * By default the name of the selected sentence is "locale + _selectedSentences" with this function the name can be changed,
	 * the locale prefix will be kept and the suffix "_selectedSentences". NOTE: If the name is changed this function should be
	 * called before calling createDBConnection() because ther prepared statments for each table are initialised.
	 * 
	 * @param name
	 *            name
	 */
	public void setSelectedSentencesTableName(String name) {
		if (name.contentEquals(""))
			selectedSentencesTableName = locale + "_selectedSentences";
		else
			selectedSentencesTableName = locale + "_" + name + "_selectedSentences";
		System.out.println("Current selected sentences table name = " + selectedSentencesTableName);
	}

	public String getSelectedSentencesTableName() {
		return selectedSentencesTableName;
	}

	public String getCleanTextTableName() {
		return cleanTextTableName;
	}

	public String getWordListTableName() {
		return wordListTableName;
	}

	public String getDBselectionTableName() {
		return dbselectionTableName;
	}

	/**
	 * The createDBConnection method creates the database connection.
	 *
	 * @param host
	 *            a String value. The database host e.g. 'localhost'.
	 * @param db
	 *            a String value. The database to connect to.
	 * @param user
	 *            a String value. Database user that has excess to the database.
	 * @param passwd
	 *            a String value. The 'secret' password.
	 * @return true if connection was succesfull, false otherwise
	 */
	public boolean createDBConnection(String host, String db, String user, String passwd) {
		boolean result = false;
		String url = "jdbc:mysql://" + host + "/" + db + "?jdbcCompliantTruncation=false";
		try {
			Properties p = new Properties();
			p.put("user", user);
			p.put("password", passwd);
			p.put("database", db);
			p.put("useUnicode", "true");
			p.put("characterEncoding", "utf8");

			cn = DriverManager.getConnection(url, p);

			st = cn.createStatement();

			psCleanText = cn.prepareStatement("INSERT INTO " + cleanTextTableName + " VALUES (null, ?, ?, ?, ?)");
			psWord = cn.prepareStatement("INSERT INTO " + wordListTableName + " VALUES (null, ?, ?)");
			psSentence = cn.prepareStatement("INSERT INTO " + dbselectionTableName + " VALUES (null, ?, ?, ?, ?, ?, ?, ?, ?)");
			psSelectedSentence = cn.prepareStatement("INSERT INTO " + selectedSentencesTableName + " VALUES (null, ?, ?, ?)");
			psTablesDescription = cn.prepareStatement("INSERT INTO tablesDescription VALUES (null, ?, ?, ?, ?, ?, ?, ?)");

			result = true;
			System.out.println("Mysql connection created successfully.");

		} catch (Exception e) {
			System.out.println("Problems creating Mysql connection.");
			e.printStackTrace();
		}
		return result;
	}

	/***
	 * Use mwdumper for extracting pages from a XML wikipedia dump file. The mwdumper reads a xml wikipedia file and extract the
	 * tables "text", "page" and "revision" in sql format. In this configuration the mwdumper creates a connection to the DB and
	 * load directly the tables, so the tables must be already created and they should be empty. Here these tables are created
	 * empty using the function createEmptyWikipediaTables, this function deletes any existing text, page and revision table
	 * before creating new ones.
	 * 
	 * @param xmlFile
	 *            xml dump file
	 * @param lang
	 *            locale language
	 * @param host
	 *            host
	 * @param db
	 *            db
	 * @param user
	 *            user
	 * @param passwd
	 *            passwd
	 * @throws Exception
	 *             Exception
	 */
	public void loadPagesWithMWDumper(String xmlFile, String lang, String host, String db, String user, String passwd)
			throws Exception {

		DBHandler wikiDB = new DBHandler(lang);
		System.out
				.println("Using mwdumper to convert xml file into sql source file and loading text, page and revision tables into the DB.");
		System.out.println("Creating connection to DB server...");
		wikiDB.createDBConnection(host, db, user, passwd);

		// Before runing the mwdumper the tables text, page and revision should be created empty.
		// If the tables exist, it could be that other user/process is using these tables so in that case
		// the program will stop. If no other user/program is using these tables the user has the option of deleting them.
		if (wikiDB.createEmptyWikipediaTables()) {

			// Very old: dump into sql file:
			// Run the mwdumper jar file xml -> sql
			// Use these parameters for saving the output in a dump.sql source file
			/*
			 * String sqlDump = "dump.sql"; String[] argsDump = new String[3]; argsDump[0] = "--output=file:"+sqlDump; argsDump[1]
			 * = "--format=sql:1.5"; argsDump[2] = xmlFile;
			 */

			/*
			 * Old: run Dumper as if it was called from the command line: // Use these parameters for loading the pages direclty
			 * in the DB String[] argsDump = new String[3]; argsDump[0] = "--output=mysql://" + host + "/" + db + "?user=" + user
			 * + "&password=" + passwd + "&useUnicode=true&characterEncoding=utf8"; argsDump[1] = "--format=sql:1.5"; argsDump[2]
			 * = xmlFile;
			 * 
			 * //--- The following ClassLoader code from: //
			 * http://java.sun.com/docs/books/tutorial/deployment/jar/examples/JarClassLoader.java // Class c = loadClass(name);
			 * // this does not work (example from sun) // Class c = Class.forName("org.mediawiki.dumper.Dumper"); // this works
			 * Class c = ClassLoader.getSystemClassLoader().loadClass("org.mediawiki.dumper.Dumper"); // this also works Method m
			 * = c.getMethod("main", new Class[] { argsDump.getClass() }); m.setAccessible(true); int mods = m.getModifiers(); if
			 * (m.getReturnType() != void.class || !Modifier.isStatic(mods) || !Modifier.isPublic(mods)) { throw new
			 * NoSuchMethodException("main"); } try { m.invoke(null, new Object[] { argsDump });
			 * 
			 * } catch (IllegalAccessException e) { // This should not happen, as we have disabled access checks }
			 */

			// Call mwdumper code directly:
			SqlStream sqlStream = new SqlServerStream(wikiDB.cn);
			DumpWriter sqlWriter = new SqlWriter15(new SqlWriter.MySQLTraits(), sqlStream);
			File tmpFile = new File(xmlFile + "_filtered");
			BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(xmlFile), "UTF-8"));
			BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpFile), "UTF-8"));
			int read = -1;
			char[] buf = new char[4096];
			while ((read = in.read(buf)) > 0) {
				out.write(buf, 0, read);
			}
			in.close();
			out.close();
			InputStream filteredInput = new BufferedInputStream(new FileInputStream(tmpFile));
			XmlDumpReader xmlReader = new XmlDumpReader(filteredInput, sqlWriter);
			xmlReader.readDump(); // this will close wikiDB.cn, so that it will have to be opened again:
			tmpFile.delete();
			if (wikiDB.cn.isClosed()) {
				wikiDB.createDBConnection(host, db, user, passwd);
			}

			// Now I need to add/change the prefix locale to the table names
			// Renaming tables
			wikiDB.addLocalePrefixToWikipediaTables(); // this change the name of already created and loaded tables

		} else {
			wikiDB.closeDBConnection();
			// problems creating the tables
			System.exit(1);
		}

	}

	/****************************************************************************************
	 * FUNCTIONS FOR CREATING TABLES
	 ****************************************************************************************/
	/***
	 * Ask the user if the table should be deleted
	 * 
	 * @param table
	 *            table
	 * @return true if user answers false otherwise.
	 */
	public boolean askIfDeletingTable(String table) {
		char c;
		boolean result = false;
		InputStreamReader isr = new InputStreamReader(System.in);
		BufferedReader br = new BufferedReader(isr);
		boolean wikipediaTables = false;

		String helpToContinue = "\n ***To continue please check if the table \""
				+ table
				+ "\" is used by another user/process.*** \n"
				+ "    Tables \"text\", \"page\" and \"revision\" are temporary tables used by mwdumper to extract wikipedia pages. \n"
				+ "    Just one user/process can use these tables at the same time in the same DataBase, please use other DataBase if possible.\n"
				+ "    If the tables are not in use by any other user/process please use the option for deleting.\n";

		if (table.contentEquals("text") || table.contentEquals("page") || table.contentEquals("revision")) {
			System.out.println(helpToContinue);
			wikipediaTables = true;
		}

		System.out.print("    TABLE = \"" + table + "\" already exists deleting (y/n)?");
		try {
			String s = br.readLine();
			if (s.contentEquals("y")) {
				result = true;
			} else {
				if (wikipediaTables)
					System.out.println("\nTo continue please check if the table \"" + table + "\" can be deleted "
							+ "or is used by another user/process.\n");
				else
					System.out.println("    Adding data to TABLE = \"" + table + "\".\n");
				result = false;
			}

		} catch (Exception e) {
			System.out.println(e);
		}
		return result;
	}

	/***
	 * This table contains information about tables in the DB, specially for selected sentences tables.
	 */
	public void createTablesDescriptionTable() {

		String tables = "CREATE TABLE IF NOT EXISTS tablesDescription ( id INT NOT NULL AUTO_INCREMENT, " + "name TINYTEXT,"
				+ "description MEDIUMTEXT," + "stopCriterion TINYTEXT," + "featuresDefinitionFileName TINYTEXT,"
				+ "featuresDefinitionFile MEDIUMTEXT," + "covDefConfigFileName TINYTEXT," + "covDefConfigFile MEDIUMTEXT,"
				+ "primary key(id)) CHARACTER SET utf8;";
		try {
			boolean res = st.execute(tables);
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/***
	 * Creates dbselectionTable
	 * 
	 * 
	 */
	public void createDataBaseSelectionTable() {
		String dbselection = "CREATE TABLE " + dbselectionTableName + " ( id INT NOT NULL AUTO_INCREMENT, "
				+ "sentence MEDIUMBLOB NOT NULL, " + "features BLOB, " + "reliable BOOLEAN, " + "unknownWords BOOLEAN, "
				+ "strangeSymbols BOOLEAN, " + "selected BOOLEAN, " + "unwanted BOOLEAN, "
				+ "cleanText_id INT UNSIGNED NOT NULL, " + // the cleanText id where this sentence comes from
				"primary key(id)) CHARACTER SET utf8;";
		String str;
		boolean dbExist = false;
		// if database does not exist create it, if it exists it will continue adding sentences to this table
		System.out.println("Checking if " + dbselectionTableName + " already exist.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}

		try {
			while (rs.next()) {
				str = rs.getString(1);
				if (str.contentEquals(dbselectionTableName)) {
					System.out.println("TABLE = " + str + " already exist, adding sentences to this table.");
					dbExist = true;
				}
			}
			boolean res;

			// if DB exists does not exist it will be created
			if (!dbExist) {
				res = st.execute(dbselection);
				System.out.println("TABLE = " + dbselectionTableName + " succesfully created.");
			}

		} catch (SQLException e) {
			e.printStackTrace();
		}

	}

	/***
	 * Creates a selectedSentencesTable.
	 *
	 * @param stopCriterion
	 *            stopCriterion
	 * @param featDefFileName
	 *            featDefFileName
	 * @param covDefConfigFileName
	 *            covDefConfigFileName
	 */
	public void createSelectedSentencesTable(String stopCriterion, String featDefFileName, String covDefConfigFileName) {
		String selected = "CREATE TABLE " + selectedSentencesTableName + " ( id INT NOT NULL AUTO_INCREMENT, "
				+ "sentence MEDIUMBLOB NOT NULL, " + "unwanted BOOLEAN, " + "dbselection_id INT UNSIGNED NOT NULL, " + // the
																														// dbselection
																														// id
																														// where
																														// this
																														// sentence
																														// comes
																														// from
				"primary key(id)) CHARACTER SET utf8;";

		String str;
		boolean dbExist = false;
		// if database does not exist create it
		System.out.println("\nChecking if " + selectedSentencesTableName + " already exist.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}

		try {
			while (rs.next()) {
				str = rs.getString(1);
				if (str.contentEquals(selectedSentencesTableName)) {
					System.out.println("  TABLE = " + str + " already exist. New selected sentences "
							+ "will be added to this table.");
					dbExist = true;
				}
			}

			// Ask if delete?
			// if(dbExist && !askIfDeletingTable(selectedSentencesTableName) )
			// result = false;

			if (!dbExist) {
				// if creating a new table then the set the fields selected=false and unwanted=false
				// in dbselection table
				System.out.println("  TABLE = " + selectedSentencesTableName + " does not exist, creating a new table and ");
				// System.out.println("  Initialising fields selected=false and unwanted=false in TABLE = " +
				// dbselectionTableName);
				System.out.println("  Initialising fields selected=false in TABLE = " + dbselectionTableName
						+ " (Previously selected sentences marked as unwanted will be kept)");
				updateTable("UPDATE " + dbselectionTableName + " SET selected=false;");
				// updateTable("UPDATE " + dbselectionTableName + " SET unwanted=false;");
				boolean res = st.execute(selected);
				System.out.println("  TABLE = " + selectedSentencesTableName + " succesfully created.");

			}

		} catch (SQLException e) {
			e.printStackTrace();
		}

	}

	/***
	 * This function creates text, page and revision tables loading them from text files.
	 * 
	 * @param textFile
	 *            textFile
	 * @param pageFile
	 *            pageFile
	 * @param revisionFile
	 *            revisionFile
	 */
	public void createAndLoadWikipediaTables(String textFile, String pageFile, String revisionFile) {

		String createTextTable = "CREATE TABLE " + locale + "_text (" + " old_id int UNSIGNED NOT NULL AUTO_INCREMENT,"
				+ " old_text mediumblob NOT NULL," + " old_flags tinyblob NOT NULL," + " PRIMARY KEY old_id (old_id)"
				+ " ) MAX_ROWS=250000 AVG_ROW_LENGTH=10240;";

		String createPageTable = "CREATE TABLE " + locale + "_page (" + "page_id int UNSIGNED NOT NULL AUTO_INCREMENT,"
				+ "page_namespace int(11) NOT NULL," + "page_title varchar(255) NOT NULL,"
				+ "page_restrictions tinyblob NOT NULL," + "page_counter bigint(20) unsigned NOT NULL,"
				+ "page_is_redirect tinyint(3) unsigned NOT NULL," + "page_is_new tinyint(3) unsigned NOT NULL,"
				+ "page_random double unsigned NOT NULL," + "page_touched binary(14) NOT NULL,"
				+ "page_latest int(10) unsigned NOT NULL," + "page_len int(10) unsigned NOT NULL,"
				+ "PRIMARY KEY page_id (page_id)," + "KEY page_namespace (page_namespace)," + "KEY page_random (page_random),"
				+ "KEY page_len (page_len) ) MAX_ROWS=250000 AVG_ROW_LENGTH=10240; ";

		String createRevisionTable = "CREATE TABLE " + locale + "_revision (" + "rev_id int UNSIGNED NOT NULL AUTO_INCREMENT,"
				+ "rev_page int(10) unsigned NOT NULL," + "rev_text_id int(10) unsigned NOT NULL,"
				+ "rev_comment tinyblob NOT NULL," + "rev_user int(10) unsigned NOT NULL,"
				+ "rev_user_text varchar(255) NOT NULL, " + "rev_timestamp binary(14) NOT NULL, "
				+ "rev_minor_edit tinyint(3) unsigned NOT NULL," + " rev_deleted tinyint(3) unsigned NOT NULL,"
				+ "rev_len int(10) unsigned NULL," + "rev_parent_id int(10) unsigned NULL,"
				+ "KEY rev_user (rev_user),KEY rev_user_text (rev_user_text)," + "KEY rev_timestamp (rev_timestamp),"
				+ "PRIMARY KEY rev_id (rev_id)) MAX_ROWS=250000 AVG_ROW_LENGTH=10240;";

		// If database does not exist create it, if it exists delete it and create an empty one.
		// System.out.println("Checking if the TABLE=text already exist.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		boolean resText = false, resPage = false, resRevision = false;
		try {

			while (rs.next()) {
				String str = rs.getString(1);
				if (str.contentEquals(locale + "_text"))
					resText = true;
				else if (str.contentEquals(locale + "_page"))
					resPage = true;
				else if (str.contentEquals(locale + "_revision"))
					resRevision = true;

			}
			if (resText) {
				System.out.println("TABLE = " + locale + "_text already exist deleting.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_text;");
			}
			if (resPage) {
				System.out.println("TABLE = " + locale + "_page already exist deleting.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_page;");
			}
			if (resRevision) {
				System.out.println("TABLE = " + locale + "_revision already exist deleting.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_revision;");
			}

			boolean res1;
			int res2;
			// creating TABLE=text
			// System.out.println("\nCreating table:" + createTextTable);
			System.out.println("\nCreating table:" + locale + "_text");
			res1 = st.execute(createTextTable);
			System.out.println("Loading sql file: " + textFile);
			res2 = st.executeUpdate("LOAD DATA LOCAL INFILE '" + textFile + "' into table " + locale + "_text;");
			System.out.println("TABLE = " + locale + "_text succesfully created.");

			// creating TABLE=page
			// System.out.println("\nCreating table:" + createPageTable);
			System.out.println("\nCreating table:" + locale + "_page");
			res1 = st.execute(createPageTable);
			System.out.println("Loading sql file: " + pageFile);
			res2 = st.executeUpdate("LOAD DATA LOCAL INFILE '" + pageFile + "' into table " + locale + "_page;");
			System.out.println("TABLE = " + locale + "_page succesfully created.");

			// creating TABLE=revision
			// System.out.println("\n\nCreating table:" + createRevisionTable);
			System.out.println("\nCreating table:" + locale + "_revision");
			res1 = st.execute(createRevisionTable);
			System.out.println("Loading sql file: " + revisionFile);
			System.out.println("SOURCE " + revisionFile + ";");
			res2 = st.executeUpdate("LOAD DATA LOCAL INFILE '" + revisionFile + "' into table " + locale + "_revision;");
			System.out.println("TABLE = " + locale + "_revision succesfully created.");

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/***
	 * This function creates empty text, page and revision tables (without locale prefix). If the tables already exist it will
	 * delete them.
	 * 
	 * @return true if the tables were created succesfully, false otherwise.
	 */
	public boolean createEmptyWikipediaTables() {

		boolean result = true;
		System.out.println("Creating empty wikipedia tables: checking if tables text, page and revision already exist.");
		String createTextTable = "CREATE TABLE text (" + " old_id int UNSIGNED NOT NULL AUTO_INCREMENT,"
				+ " old_text mediumblob NOT NULL," + " old_flags tinyblob NOT NULL," + " PRIMARY KEY old_id (old_id)"
				+ " ) MAX_ROWS=250000 AVG_ROW_LENGTH=10240 CHARACTER SET utf8;";

		String createPageTable = "CREATE TABLE page (" + "page_id int UNSIGNED NOT NULL AUTO_INCREMENT,"
				+ "page_namespace int(11) NOT NULL," + "page_title varchar(255) NOT NULL,"
				+ "page_restrictions tinyblob NOT NULL," + "page_counter bigint(20) unsigned NOT NULL,"
				+ "page_is_redirect tinyint(3) unsigned NOT NULL," + "page_is_new tinyint(3) unsigned NOT NULL,"
				+ "page_random double unsigned NOT NULL," + "page_touched binary(14) NOT NULL,"
				+ "page_latest int(10) unsigned NOT NULL," + "page_len int(10) unsigned NOT NULL,"
				+ "PRIMARY KEY page_id (page_id)," + "KEY page_namespace (page_namespace)," + "KEY page_random (page_random),"
				+ "KEY page_len (page_len) ) MAX_ROWS=250000 AVG_ROW_LENGTH=10240 CHARACTER SET utf8;";

		String createRevisionTable = "CREATE TABLE revision (" + "rev_id int UNSIGNED NOT NULL AUTO_INCREMENT,"
				+ "rev_page int(10) unsigned NOT NULL," + "rev_text_id int(10) unsigned NOT NULL,"
				+ "rev_comment tinyblob NOT NULL," + "rev_user int(10) unsigned NOT NULL,"
				+ "rev_user_text varchar(255) NOT NULL, " + "rev_timestamp binary(14) NOT NULL, "
				+ "rev_minor_edit tinyint(3) unsigned NOT NULL," + " rev_deleted tinyint(3) unsigned NOT NULL,"
				+ "rev_len int(10) unsigned NULL," + "rev_parent_id int(10) unsigned NULL,"
				+ "KEY rev_user (rev_user),KEY rev_user_text (rev_user_text)," + "KEY rev_timestamp (rev_timestamp),"
				+ "PRIMARY KEY rev_id (rev_id)) MAX_ROWS=250000 AVG_ROW_LENGTH=10240 CHARACTER SET utf8;";

		// If database does not exist create it, if it exists delete it and create an empty one.
		// System.out.println("Checking if the TABLE=text already exist.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		boolean resText = false, resPage = false, resRevision = false;
		try {

			while (rs.next()) {
				String str = rs.getString(1);
				if (str.contentEquals("text"))
					resText = true;
				else if (str.contentEquals("page"))
					resPage = true;
				else if (str.contentEquals("revision"))
					resRevision = true;

			}
			boolean res0, res1;
			if (resText && !askIfDeletingTable("text"))
				result = false;
			else if (resPage && !askIfDeletingTable("page"))
				result = false;
			else if (resRevision && !askIfDeletingTable("revision"))
				result = false;

			if (result) {
				res0 = st.execute("DROP TABLE IF EXISTS text;");
				System.out.println("  Creating table: text");
				res1 = st.execute(createTextTable);
				res0 = st.execute("DROP TABLE IF EXISTS page;");
				System.out.println("  Creating table: page");
				res1 = st.execute(createPageTable);
				res0 = st.execute("DROP TABLE IF EXISTS revision;");
				System.out.println("  Creating table: revision");
				res1 = st.execute(createRevisionTable);
			}

		} catch (SQLException e) {
			e.printStackTrace();
		}

		return result;
	}

	/****
	 * Rename the Wikipedia tables adding the prefix locale: locale_text locale_page and locale_revision. if any of the locale_*
	 * tables exist will be deleted before renaming the table.
	 */
	public void addLocalePrefixToWikipediaTables() {

		System.out.println("Adding local prefix " + locale + " to tables text, page and revision, checking if "
				+ "already exist tables with that prefix.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		boolean resText = false, resPage = false, resRevision = false;
		boolean resLocaleText = false, resLocalePage = false, resLocaleRevision = false;
		try {

			while (rs.next()) {
				String str = rs.getString(1);
				if (str.contentEquals("text"))
					resText = true;
				else if (str.contentEquals(locale + "_text"))
					resLocaleText = true;
				else if (str.contentEquals("page"))
					resPage = true;
				else if (str.contentEquals(locale + "_page"))
					resLocalePage = true;
				else if (str.contentEquals("revision"))
					resRevision = true;
				else if (str.contentEquals(locale + "_revision"))
					resLocaleRevision = true;
			}
			if (resLocaleText) {
				System.out.println("  Deleting TABLE = " + locale + "_text.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_text;");
			}
			if (resLocalePage) {
				System.out.println("  Deleting TABLE = " + locale + "_page.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_page;");
			}
			if (resLocaleRevision) {
				System.out.println("  Deleting TABLE = " + locale + "_revision.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_revision;");
			}

			if (resText) {
				System.out.println("  RENAME TABLE = text TO " + locale + "_text.");
				boolean res0 = st.execute("RENAME TABLE text TO " + locale + "_text;");
			}
			if (resPage) {
				System.out.println("  RENAME TABLE = page TO " + locale + "_page.");
				boolean res0 = st.execute("RENAME TABLE page TO " + locale + "_page;");
			}
			if (resRevision) {
				System.out.println("  RENAME TABLE = revision TO " + locale + "_revision.");
				boolean res0 = st.execute("RENAME TABLE revision TO " + locale + "_revision;");
			}

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/****
	 * Delete the Wikipedia tables: locale_text, locale_page and locale_revision tables.
	 *
	 */
	public void deleteWikipediaTables() {

		System.out.println("Deleting already used wikipedia tables.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		boolean resText = false, resPage = false, resRevision = false;
		try {

			while (rs.next()) {
				String str = rs.getString(1);
				if (str.contentEquals(locale + "_text"))
					resText = true;
				else if (str.contentEquals(locale + "_page"))
					resPage = true;
				else if (str.contentEquals(locale + "_revision"))
					resRevision = true;

			}
			if (resText) {
				System.out.println("  Deleting TABLE = " + locale + "_text.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_text;");
			}
			if (resPage) {
				System.out.println("  Deleting TABLE = " + locale + "_page.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_page;");
			}
			if (resRevision) {
				System.out.println("  Deleting TABLE = " + locale + "_revision.");
				boolean res0 = st.execute("DROP TABLE " + locale + "_revision;");
			}

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/***
	 * check if tables: locale_text, locale_page and locale_revision exist.
	 * 
	 * @return resText && resPage && resRevision
	 */
	public boolean checkWikipediaTables() {
		// wiki must be already created

		// If database does not exist create it, if it exists delete it and create an empty one.
		System.out.println("Checking if the TABLE=" + locale + "_text already exist.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		boolean resText = false, resPage = false, resRevision = false;
		try {

			while (rs.next()) {
				String str = rs.getString(1);
				if (str.contentEquals(locale + "_text"))
					resText = true;
				else if (str.contentEquals(locale + "_page"))
					resPage = true;
				else if (str.contentEquals(locale + "_revision"))
					resRevision = true;

			}
			if (resText)
				System.out.println("TABLE =" + locale + "_text already exist.");
			if (resPage)
				System.out.println("TABLE =" + locale + "_page already exist.");
			if (resRevision)
				System.out.println("TABLE =" + locale + "_revision already exist.");

		} catch (SQLException e) {
			e.printStackTrace();
		}

		return resText && resPage && resRevision;
	}

	public void createWikipediaCleanTextTable() {
		// wiki must be already created
		// String creteWiki = "CREATE DATABASE wiki;";
		String createCleanTextTable = "CREATE TABLE " + cleanTextTableName + " (" + " id int UNSIGNED NOT NULL AUTO_INCREMENT,"
				+ " cleanText MEDIUMBLOB NOT NULL," + " processed BOOLEAN, " + " page_id int UNSIGNED NOT NULL, "
				+ " text_id int UNSIGNED NOT NULL, " + " PRIMARY KEY id (id)"
				+ " ) MAX_ROWS=250000 AVG_ROW_LENGTH=10240 CHARACTER SET utf8;";

		// If database does not exist create it, if it exists delete it and create an empty one.
		System.out.println("Checking if the TABLE=" + cleanTextTableName + " already exist.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		boolean resText = false;
		try {

			while (rs.next()) {
				String str = rs.getString(1);
				if (str.contentEquals(cleanTextTableName))
					resText = true;
			}
			if (resText) {
				System.out.println("TABLE = " + cleanTextTableName + " already exist deleting.");
				boolean res0 = st.execute("DROP TABLE " + cleanTextTableName + ";");
			}

			boolean res1;
			int res2;
			// creating TABLE=cleanText
			// System.out.println("\nCreating table:" + createCleanTextTable);
			System.out.println("\nCreating table:" + cleanTextTableName);
			res1 = st.execute(createCleanTextTable);
			System.out.println("TABLE = " + cleanTextTableName + " succesfully created.");

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/***
	 * 
	 * @return true if TABLE=tableName exist.
	 * @param tableName
	 *            tableName
	 */
	public boolean tableExist(String tableName) {
		// System.out.println("  Checking if the TABLE=" + tableName + " exist.");
		try {
			rs = st.executeQuery("SHOW TABLES;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		boolean res = false;
		try {
			while (rs.next()) {
				String str = rs.getString(1);
				if (str.contentEquals(tableName))
					res = true;
			}
		} catch (SQLException e) {
			e.printStackTrace();
		}
		return res;
	}

	/***
	 * Get a list of ids from field in table.
	 * 
	 * @param field
	 *            field
	 * @param table
	 *            table
	 * @return idSet
	 */
	public String[] getIds(String field, String table) {
		int num, i, j;
		String idSet[] = null;

		String str = queryTable("SELECT count(" + field + ") FROM " + table + ";");
		num = Integer.parseInt(str);
		idSet = new String[num];

		try {
			rs = st.executeQuery("SELECT " + field + " FROM " + table + ";");
		} catch (Exception e) {
			e.printStackTrace();
		}
		try {
			i = 0;
			while (rs.next()) {
				idSet[i] = rs.getString(1);
				i++;
			}
		} catch (SQLException e) {
			e.printStackTrace();
		}

		return idSet;
	}

	/***
	 * This function will select just the unprocessed cleanText records.
	 * 
	 * @return idSet
	 */
	public int[] getUnprocessedTextIds() {
		int num, i, j;
		int idSet[] = null;

		String str = queryTable("select count(id) from " + cleanTextTableName + " where processed=false;");
		num = Integer.parseInt(str);
		idSet = new int[num];

		try {
			rs = st.executeQuery("select id from " + cleanTextTableName + " where processed=false;");
		} catch (Exception e) {
			e.printStackTrace();
		}
		try {
			i = 0;
			while (rs.next()) {
				idSet[i] = rs.getInt(1);
				i++;
			}
		} catch (SQLException e) {
			e.printStackTrace();
		}

		return idSet;
	}

	/***
	 * Get the list of tables for this locale
	 * 
	 * @return ArrayList<String>
	 */
	public ArrayList getListOfTables() {
		ArrayList tablesList = new ArrayList();
		try {
			rs = st.executeQuery("show tables like '" + locale + "%';");
		} catch (Exception e) {
			e.printStackTrace();
		}
		try {
			while (rs.next())
				tablesList.add(rs.getString(1));
		} catch (SQLException e) {
			e.printStackTrace();
		}
		return tablesList;
	}

	public void setDBTable(String table) {
		currentTable = table;
	}

	public void setWordListTable(String table) {
		wordListTableName = table;
	}

	public void insertCleanText(String text, String page_id, String text_id) {
		// System.out.println("inserting in cleanText: ");
		byte cleanText[] = null;

		try {
			cleanText = text.getBytes("UTF8");
		} catch (Exception e) { // UnsupportedEncodedException
			e.printStackTrace();
		}

		try {
			// ps = cn.prepareStatement("INSERT INTO cleanText VALUES (null, ?, ?, ?, ?)");
			if (cleanText != null) {
				psCleanText.setBytes(1, cleanText);
				psCleanText.setBoolean(2, false); // it will be true after processed by the FeatureMaker
				psCleanText.setInt(3, Integer.parseInt(page_id));
				psCleanText.setInt(4, Integer.parseInt(text_id));
				psCleanText.execute();
				psCleanText.clearParameters();
			} else
				System.out.println("WARNING: can not insert in " + cleanTextTableName + ": " + text);

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/****
	 * Insert processed sentence in dbselection
	 * 
	 * @param sentence
	 *            text of the sentence.
	 * @param features
	 *            features if sentences is reliable.
	 * @param reliable
	 *            true/false.
	 * @param unknownWords
	 *            true/false.
	 * @param strangeSymbols
	 *            true/false.
	 * @param cleanText_id
	 *            the id of the cleanText this sentence comes from.
	 */
	public void insertSentence(String sentence, byte features[], boolean reliable, boolean unknownWords, boolean strangeSymbols,
			int cleanText_id) {

		byte strByte[] = null;
		try {
			strByte = sentence.getBytes("UTF8");
		} catch (Exception e) { // UnsupportedEncodedException
			e.printStackTrace();
		}

		try {
			psSentence.setBytes(1, strByte);
			psSentence.setBytes(2, features);
			psSentence.setBoolean(3, reliable);
			psSentence.setBoolean(4, unknownWords);
			psSentence.setBoolean(5, strangeSymbols);
			psSentence.setBoolean(6, false);
			psSentence.setBoolean(7, false);
			psSentence.setInt(8, cleanText_id);
			psSentence.execute();

			psSentence.clearParameters();

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/***
	 * With the dbselection_id get first the sentence and then insert it in the locale_selectedSentences table.
	 * 
	 * @param dbselection_id
	 *            dbselection_id
	 * @param unwanted
	 *            unwanted
	 */
	public void insertSelectedSentence(int dbselection_id, boolean unwanted) {

		String dbQuery = "Select sentence FROM " + dbselectionTableName + " WHERE id=" + dbselection_id;
		byte[] sentenceBytes = null;

		try {
			// First get the sentence
			sentenceBytes = queryTableByte(dbQuery);

			psSelectedSentence.setBytes(1, sentenceBytes);
			psSelectedSentence.setBoolean(2, unwanted);
			psSelectedSentence.setInt(3, dbselection_id);
			psSelectedSentence.execute();

			psSelectedSentence.clearParameters();

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	/****
	 * Creates a wordList table, if already exists deletes it and creates a new to insert current wordList.
	 * 
	 * @param wordList
	 *            wordList
	 */
	public void insertWordList(HashMap wordList) {
		String word;
		Integer value;
		boolean res;
		byte wordByte[];

		String wordListTable = "CREATE TABLE " + wordListTableName + " ( id INT NOT NULL AUTO_INCREMENT, "
				+ "word TINYBLOB NOT NULL, " + "frequency INT UNSIGNED NOT NULL, " + "primary key(id)) CHARACTER SET utf8;";

		try {
			System.out.println("Inserting wordList in DB...");
			// if wordList table already exist it should be deleted before inserting this list
			if (tableExist(wordListTableName)) {
				res = st.execute("DROP TABLE " + wordListTableName + ";");
				res = st.execute(wordListTable);
			} else
				res = st.execute(wordListTable);

			try {
				Iterator iteratorSorted = wordList.keySet().iterator();
				int count = 0;

				cn.setAutoCommit(false);

				while (iteratorSorted.hasNext()) {
					word = iteratorSorted.next().toString();
					value = wordList.get(word);
					wordByte = null;
					wordByte = word.getBytes("UTF8");
					psWord.setBytes(1, wordByte);
					psWord.setInt(2, value);
					psWord.addBatch();

					if (count++ == 1000) {
						psWord.executeBatch();
						cn.commit();
						count = 0;
						System.out.println("Adding batch.");
					}

					psWord.clearParameters();
				}

				psWord.executeBatch(); // the leftovers.
				cn.commit();
				cn.setAutoCommit(true);

			} catch (Exception e) { // UnsupportedEncodedException
				e.printStackTrace();
			}
			System.out.println("Inserted new words in " + wordListTableName + " table.");

		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	public void closeDBConnection() {
		try {
			cn.close();
			System.out.println("\nMysql connection closed.");
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	public int getNumberOfReliableSentences() {
		String dbQuery = "SELECT count(sentence) FROM " + dbselectionTableName + " where reliable=true;";
		String str = queryTable(dbQuery);
		return Integer.parseInt(str);
	}

	/***
	 * Get a list of id's
	 * 
	 * @param table
	 *            cleanText, wordList, dbselection (no need to add locale) (NOTE: this function does not work for the
	 *            selectedSentences table, for this table use the function getIdListOfSelectedSentences ).
	 * @param condition
	 *            reliable, unknownWords, strangeSymbols, selected, unwanted = true/false (combined are posible:
	 *            "reliable=true and unwanted=false"); or condition=null for querying without condition.
	 * @return int array or null if the list of id's is empty.
	 */
	public int[] getIdListOfType(String table, String condition) {
		int num, i, j;
		int idSet[] = null;
		int maxNum = 500000;
		String getNum, getIds, getIdsShort;

		if (condition != null) {
			getNum = "SELECT count(id) FROM " + locale + "_" + table + " where " + condition + ";";
			getIds = "SELECT id FROM " + locale + "_" + table + " where " + condition;
		} else {
			getNum = "SELECT count(id) FROM " + locale + "_" + table + ";";
			getIds = "SELECT id FROM " + locale + "_" + table;
		}

		String str = queryTable(getNum);
		num = Integer.parseInt(str);
		// System.out.println("num = " + num);
		if (num > 0) {
			idSet = new int[num];
			i = 0;
			if (num > maxNum) {
				for (j = 0; j < num; j += maxNum) {
					try {
						getIdsShort = getIds + " limit " + j + "," + maxNum;
						// System.out.println("getIdsShort=" + getIdsShort);

						rs = st.executeQuery(getIdsShort);
						while (rs.next()) {
							idSet[i] = rs.getInt(1);
							i++;
						}
						System.out.println("  Num of ids retrieved = " + i);
						rs.close();

					} catch (SQLException e) {
						e.printStackTrace();
					}
				}
			} else { // if num < maxNum
				try {
					rs = st.executeQuery(getIds);
					while (rs.next()) {
						idSet[i] = rs.getInt(1);
						i++;
					}
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}

		} else
			System.out.println("WARNING empty list for: " + getIds);
		// System.out.println("idSet.length=" + idSet.length);
		return idSet;
	}

	/**
	 * For the set of sentences identified by table and condition, retrieve from Mysql both the sentence ids and the corresponding
	 * features.
	 * 
	 * @param table
	 *            table
	 * @param condition
	 *            condition
	 * @return Pair<int[], byte[][]>(idSet, features)
	 */
	public Pair getIdsAndFeatureVectors(String table, String condition) {
		int num, i, j;
		int idSet[] = null;
		byte features[][] = null;
		int maxNum = 500000;
		String getNum, getIds, getIdsShort;

		if (condition != null) {
			getNum = "SELECT count(id) FROM " + locale + "_" + table + " where " + condition + ";";
			getIds = "SELECT id,features FROM " + locale + "_" + table + " where " + condition;
		} else {
			getNum = "SELECT count(id) FROM " + locale + "_" + table + ";";
			getIds = "SELECT id,features FROM " + locale + "_" + table;
		}

		String str = queryTable(getNum);
		num = Integer.parseInt(str);
		System.out.println(num + " sentences to retrieve...");
		// System.out.println("num = " + num);
		if (num > 0) {
			idSet = new int[num];
			features = new byte[num][];
			i = 0;
			if (num > maxNum) {
				for (j = 0; j < num; j += maxNum) {
					try {
						getIdsShort = getIds + " limit " + j + "," + maxNum;

						rs = st.executeQuery(getIdsShort);
						while (rs.next()) {
							idSet[i] = rs.getInt(1);
							features[i] = rs.getBytes(2);
							i++;
						}
						System.out.println("  Num of ids+features retrieved = " + i);
						rs.close();

					} catch (SQLException e) {
						e.printStackTrace();
					}
				}
			} else { // if num < maxNum
				try {
					rs = st.executeQuery(getIds);
					while (rs.next()) {
						idSet[i] = rs.getInt(1);
						features[i] = rs.getBytes(2);
						i++;
					}
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}

		} else
			System.out.println("WARNING empty list for: " + getIds);
		// System.out.println("idSet.length=" + idSet.length);
		return new Pair(idSet, features);
	}

	/***
	 * Get a list of id's from a selected sentences table.
* NOTE: use the actual table name: local + tableName + selectedsentences * * * * @param actualTableName * = locale_tableName_selectedSentences * @param condition * unwanted=true/false * @return idSet */ public int[] getIdListOfSelectedSentences(String actualTableName, String condition) { int num, i, j; int idSet[] = null; String getNum, getIds; // String actualTableName = lang + "_" + tableName + "_selectedSentences"; getNum = "SELECT count(dbselection_id) FROM " + actualTableName + " where " + condition + ";"; getIds = "SELECT dbselection_id FROM " + actualTableName + " where " + condition + ";"; String str = queryTable(getNum); num = Integer.parseInt(str); if (num > 0) { idSet = new int[num]; try { rs = st.executeQuery(getIds); } catch (Exception e) { e.printStackTrace(); } try { i = 0; while (rs.next()) { idSet[i] = rs.getInt(1); i++; } } catch (SQLException e) { e.printStackTrace(); } } else System.out.println("WARNING empty list for: " + getIds); return idSet; } /*** * Get number of words in the wordList table. * * @return int number of words. * @param maxFrequency * max frequency of a word to be considered in the list, if maxFrequency=0 it will retrieve all the words with * frequency≥1. */ public int getNumberOfWords(int maxFrequency) { String where = ""; if (maxFrequency > 0) where = "where frequency > " + maxFrequency; String dbQuery = "SELECT count(word) FROM " + wordListTableName + " " + where + ";"; String str = queryTable(dbQuery); return Integer.parseInt(str); } /**** * Get the most frequent words and its frequency in a HashMap. * * @param numWords * max number of words to retrieve, if numWords=0 then it will retrieve all the words in the list in descending * order of frequency. * @param maxFrequency * max frequency of a word to be considered in the list, if maxFrequency=0 it will retrieve all the words with * frequency≥1. * @return wordList */ public HashMap getMostFrequentWords(int numWords, int maxFrequency) { HashMap wordList; String dbQuery, where = "", word; int initialCapacity = 200000; byte wordBytes[]; if (maxFrequency > 0) where = "where frequency > " + maxFrequency; if (numWords > 0) { dbQuery = "SELECT word,frequency FROM " + wordListTableName + " " + where + " order by frequency desc limit " + numWords + ";"; wordList = new HashMap(numWords); } else { dbQuery = "SELECT word,frequency FROM " + wordListTableName + " " + where + " order by frequency desc"; wordList = new HashMap(initialCapacity); } try { rs = st.executeQuery(dbQuery); } catch (Exception e) { e.printStackTrace(); } try { while (rs.next()) { wordBytes = rs.getBytes(1); word = new String(wordBytes, "UTF8"); wordList.put(word, new Integer(rs.getInt(2))); } } catch (SQLException e) { e.printStackTrace(); } catch (Exception e) { // catch unsupported encoding exception e.printStackTrace(); } return wordList; } /**** * Get the most frequent words sorted by frequency (descendent) in an ArrayList. * * @param numWords * max number of words to retrieve, if numWords=0 then it will retrieve all the words in the list in descending * order of frequency. * @param maxFrequency * max frequency of a word to be considered in the list, if maxFrequency=0 it will retrieve all the words with * frequency≥1. * @return words */ public ArrayList getMostFrequentWordsArray(int numWords, int maxFrequency) { ArrayList words = new ArrayList(); ; String dbQuery, where = "", word; int initialCapacity = 200000; byte wordBytes[]; if (maxFrequency > 0) where = "where frequency > " + maxFrequency; if (numWords > 0) dbQuery = "SELECT word,frequency FROM " + wordListTableName + " " + where + " order by frequency desc limit " + numWords + ";"; else dbQuery = "SELECT word,frequency FROM " + wordListTableName + " " + where + " order by frequency desc"; try { rs = st.executeQuery(dbQuery); } catch (Exception e) { e.printStackTrace(); } try { while (rs.next()) { wordBytes = rs.getBytes(1); word = new String(wordBytes, "UTF8"); words.add(word); } } catch (SQLException e) { e.printStackTrace(); } catch (Exception e) { // catch unsupported encoding exception e.printStackTrace(); } return words; } /**** * * @param fileName * file to write the list * @param order * word or frequency * @param numWords * max number of words, if numWords=0 then it will retrieve all the words in the list. * @param maxFrequency * max frequency of a word to be considered in the list, if maxFrequency=0 it will retrieve all the words with * frequency≥1. */ public void printWordList(String fileName, String order, int numWords, int maxFrequency) { PrintWriter pw; String dbQuery, where = ""; String orderBy; byte wordBytes[]; String word; if (maxFrequency > 0) where = "where frequency > " + maxFrequency; if (order.contentEquals("word")) orderBy = "word asc"; else orderBy = "frequency desc"; if (numWords > 0) dbQuery = "SELECT word,frequency from " + wordListTableName + " " + where + " order by " + orderBy + " limit " + numWords + ";"; else dbQuery = "SELECT word,frequency from " + wordListTableName + " " + where + " order by " + orderBy; try { rs = st.executeQuery(dbQuery); } catch (Exception e) { e.printStackTrace(); } try { pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File(fileName)), "UTF-8")); wordBytes = null; while (rs.next()) { wordBytes = rs.getBytes(1); word = new String(wordBytes, "UTF8"); pw.println(word + " " + rs.getInt(2)); } pw.close(); } catch (SQLException e) { e.printStackTrace(); } catch (Exception e) { // catch unsupported encoding exception e.printStackTrace(); } System.out.println(wordListTableName + " printed in file: " + fileName + " ordered by " + order); } /*** * Get a sentence from a locale_dbselection table. * * @param id * dbselection (no need to add locale) * @return String sentence */ public String getDBSelectionSentence(int id) { String sentence = ""; String dbQuery = "Select sentence FROM " + dbselectionTableName + " WHERE id=" + id; byte[] sentenceBytes = null; sentenceBytes = queryTableByte(dbQuery); try { sentence = new String(sentenceBytes, "UTF8"); // System.out.println(" TEXT: " + text); } catch (Exception e) { // UnsupportedEncodedException e.printStackTrace(); } return sentence; } /** * Get a * * @param tableName * tableName * @param id * id * @return sentence */ public String getSelectedSentence(String tableName, int id) { String sentence = ""; String dbQuery = "Select sentence FROM " + tableName + " WHERE dbselection_id=" + id; byte[] sentenceBytes = null; sentenceBytes = queryTableByte(dbQuery); try { sentence = new String(sentenceBytes, "UTF8"); // System.out.println(" TEXT: " + text); } catch (Exception e) { // UnsupportedEncodedException e.printStackTrace(); } return sentence; } // Firts filtering: // get first the page_title and check if it is not Image: or Wikipedia:Votes_for_deletion/ // maybe we can check also the length public String getTextFromWikiPage(String id, int minPageLength, StringBuilder old_id, PrintWriter pw) { String pageTitle, pageLen, dbQuery, textId, text = null; byte[] textBytes = null; int len; dbQuery = "Select page_title FROM " + locale + "_page WHERE page_id=" + id; pageTitle = queryTable(dbQuery); dbQuery = "Select page_len FROM " + locale + "_page WHERE page_id=" + id; pageLen = queryTable(dbQuery); len = Integer.parseInt(pageLen); if (len < minPageLength || pageTitle.contains("Wikipedia:") || pageTitle.contains("Image:") || pageTitle.contains("Template:") || pageTitle.contains("Category:") || pageTitle.contains("List_of_")) { // System.out.println("PAGE NOT USED page title=" + pageTitle + " Len=" + len); /* * dbQuery = "select rev_text_id from revision where rev_page=" + id; textId = queryTable(dbQuery); dbQuery = * "select old_text from text where old_id=" + textId; text = queryTable(dbQuery); System.out.println("TEXT: " + * text); */ } else { // System.out.println("PAGE page_id=" + id + " PAGE SELECTED page title=" + pageTitle + " Len=" + len); if (pw != null) pw.println("\nSELECTED PAGE TITLE=" + pageTitle + " Len=" + len); dbQuery = "select rev_text_id from " + locale + "_revision where rev_page=" + id; textId = queryTable(dbQuery); old_id.delete(0, old_id.length()); old_id.insert(0, textId); dbQuery = "select old_text from " + locale + "_text where old_id=" + textId; textBytes = queryTableByte(dbQuery); try { text = new String(textBytes, "UTF8"); // System.out.println(" TEXT: " + text); } catch (Exception e) { // UnsupportedEncodedException e.printStackTrace(); } } return text; } public String getCleanText(int id) { String dbQuery, text = null; byte[] textBytes = null; dbQuery = " select cleanText from " + cleanTextTableName + " where id=" + id; textBytes = queryTableByte(dbQuery); try { text = new String(textBytes, "UTF8"); // System.out.println(" TEXT: " + text); } catch (Exception e) { // UnsupportedEncodedException e.printStackTrace(); } // once retrieved the text record mark it as processed updateTable("UPDATE " + cleanTextTableName + " SET processed=true WHERE id=" + id); return text; } /*** * Set a sentence record field as true/false in dbselection table. * * @param id * id * @param field * reliable, unknownWords, strangeSymbols, selected or unwanted = true/false * @param fieldValue * true/false (as string) */ public void setSentenceRecord(int id, String field, boolean fieldValue) { String bval; if (fieldValue) bval = "true"; else bval = "false"; updateTable("UPDATE " + dbselectionTableName + " SET " + field + "=" + bval + " WHERE id=" + id); } /*** * This function updates the unwanted field as true/false of dbselection TABLE and selectedSentencesTable TABLE.
* NOTE: use the actual table name: local + tableName + selectedsentences * * @param actualTableName * including local and _selectedSentences * @param id * id in dbselection table * @param fieldValue * true/false */ public void setUnwantedSentenceRecord(String actualTableName, int id, boolean fieldValue) { String bval; if (fieldValue) bval = "true"; else bval = "false"; updateTable("UPDATE " + actualTableName + " SET unwanted=" + bval + " WHERE dbselection_id=" + id); updateTable("UPDATE " + dbselectionTableName + " SET unwanted=" + bval + " WHERE id=" + id); } /*** * Set a description for table = name, it checks if the table tablesDescription exist, if not it creates it. * * @param tableName * the name of the table, it can not be null * @param description * if no description set to null * @param stopCriterion * if no stopCriterion set to null * @param featuresDefinitionFileName * if no featuresDefinitionFileName set to null * @param covDefConfigFileName * if no covDefConfigFileNamen set to null */ public void setTableDescription(String tableName, String description, String stopCriterion, String featuresDefinitionFileName, String covDefConfigFileName) { boolean descExists = false; int val = 0; if (tableName != null) { // check if tablesDescription exists if (tableExist("tablesDescription")) { // check if a description for that name already exist try { rs = st.executeQuery("SELECT id from tablesDescription where name='" + tableName + "';"); while (rs.next()) { val = rs.getInt(1); if (val > 0) descExists = true; } } catch (SQLException e) { e.printStackTrace(); } } else createTablesDescriptionTable(); if (!descExists) { try { System.out.println(" Adding a description for the table " + tableName + " in TABLE = tablesDescription."); psTablesDescription.setString(1, tableName); if (description != null) psTablesDescription.setString(2, description); else psTablesDescription.setString(2, ""); if (stopCriterion != null) psTablesDescription.setString(3, stopCriterion); else psTablesDescription.setString(3, ""); // get the file and the content of the file if (featuresDefinitionFileName != null) { psTablesDescription.setString(4, featuresDefinitionFileName); String str = "", features = ""; try { BufferedReader in = new BufferedReader(new FileReader(featuresDefinitionFileName)); while ((str = in.readLine()) != null) features += str + "\n"; in.close(); } catch (IOException e) { } psTablesDescription.setString(5, features); } else { psTablesDescription.setString(4, ""); psTablesDescription.setString(5, ""); } // get the file and the content of the file if (covDefConfigFileName != null) { psTablesDescription.setString(6, covDefConfigFileName); String str = "", config = ""; try { BufferedReader in = new BufferedReader(new FileReader(covDefConfigFileName)); while ((str = in.readLine()) != null) config += str + "\n"; in.close(); } catch (IOException e) { } psTablesDescription.setString(7, config); } else { psTablesDescription.setString(6, ""); psTablesDescription.setString(7, ""); } psTablesDescription.execute(); psTablesDescription.clearParameters(); } catch (SQLException e) { e.printStackTrace(); } } else // if a description for this name already exist System.out.println(" A description for the table " + tableName + " already exist in TABLE = tablesDescription."); } else // tableName can not be null System.out.println(" Error setting table description: tableName can not be null"); } /*** * Get the description of the tableName * * @param tableName * tableName * @return and String array where: desc[0] tableName desc[1] description desc[2] stopCriterion desc[3] * featuresDefinitionFileName desc[4] featuresDefinitionFile desc[5] covDefConfigFileName desc[6] covDefConfigFile */ public String[] getTableDescription(String tableName) { String[] desc = new String[7]; PreparedStatement psDesc = null; try { psDesc = cn.prepareStatement("SELECT * from tablesDescription where name='" + tableName + "';"); rs = psDesc.executeQuery(); while (rs.next()) { for (int i = 2; i < 9; i++) { desc[(i - 2)] = rs.getString(i); // the id is not returned // System.out.println("desc[" + i + "]=" + desc[(i-2)]); } } psDesc.close(); } catch (SQLException e) { e.printStackTrace(); } return desc; } private boolean updateTable(String sql) { String str = ""; boolean res = false; try { res = st.execute(sql); } catch (Exception e) { e.printStackTrace(); } return res; } private String queryTable(String dbQuery) { String str = ""; // String dbQuery = "Select * FROM " + currentTable; // System.out.println("querying: " + dbQuery); try { rs = st.executeQuery(dbQuery); } catch (Exception e) { e.printStackTrace(); } try { while (rs.next()) { // String url = rs.getString(2); // str = rs.getString(field); str = rs.getString(1); } } catch (SQLException e) { e.printStackTrace(); } return str; } private byte[] queryTableByte(String dbQuery) { byte strBytes[] = null; // String dbQuery = "Select * FROM " + currentTable; // System.out.println("querying: " + dbQuery); try { rs = st.executeQuery(dbQuery); } catch (Exception e) { e.printStackTrace(); } try { while (rs.next()) { // String url = rs.getString(2); // str = rs.getString(field); // str = rs.getString(1); strBytes = rs.getBytes(1); } } catch (Exception e) { e.printStackTrace(); } return strBytes; } public byte[] getFeatures(int id) { byte[] fea = null; String dbQuery = "Select features FROM " + dbselectionTableName + " WHERE id=" + id; // System.out.println("querying: " + dbQuery); try { rs = st.executeQuery(dbQuery); } catch (Exception e) { e.printStackTrace(); } try { while (rs.next()) { fea = rs.getBytes(1); } } catch (SQLException e) { e.printStackTrace(); } return fea; } /** * Bulk load a set of features as identified by their IDs. * * @param ids * a sorted array of feature IDs. * @return an array of coverage features, of the same length as the input array. */ public byte[][] getFeaturesBulk(int[] ids) { HashMap featuresSet = getFeaturesSet(0, ids.length - 1, ids); byte[][] data = new byte[ids.length][]; for (int i = 0; i < ids.length; i++) { data[i] = featuresSet.get(ids[i]); if (data[i] == null) { throw new NullPointerException("Could not get features for sentence ID " + ids[i]); } } return data; } public HashMap getFeaturesSet(int ini, int end, int[] idList) { int id; int iniId = idList[ini]; int endId = idList[end]; byte[] f; PreparedStatement psFeaturesSet = null; boolean IdInRange = iniId > 0 && endId > 0; int initialCapacity = IdInRange ? endId - iniId : end - ini; HashMap feas = new HashMap(initialCapacity); try { if (IdInRange) { psFeaturesSet = cn.prepareStatement("SELECT id,features FROM " + dbselectionTableName + " WHERE reliable=true and selected=false and unwanted=false and id>=? and id<=?"); psFeaturesSet.setInt(1, iniId); psFeaturesSet.setInt(2, endId); rs = psFeaturesSet.executeQuery(); while (rs.next()) { id = rs.getInt(1); f = rs.getBytes(2); feas.put(id, f); // System .out.println("adding id=" + id); } } else { // one or both ids are negative so we need to retrieve one by one System.out.println("getFeaturesSet: negative indexes retrieving the features one by one...."); psFeaturesSet = cn.prepareStatement("Select features FROM " + dbselectionTableName + " WHERE id=?"); for (int i = ini; i <= end; i++) { if (idList[i] > 0) { try { psFeaturesSet.setInt(1, idList[i]); rs = psFeaturesSet.executeQuery(); while (rs.next()) { f = rs.getBytes(1); feas.put(idList[i], f); // System .out.println("adding id=" + id); } } catch (SQLException e) { e.printStackTrace(); } } } // end for loop } // end else, the indexes are negative } catch (SQLException e) { e.printStackTrace(); } finally { FileUtils.close(psFeaturesSet); } return feas; } /** * The following characteres should be escaped: \0 An ASCII 0 (NUL) character. \' A single quote (“'”) character. * \" A double quote (“"”) character. * * @param str * str * @return str */ public String mysqlEscapeCharacters(String str) { str = str.replace("\0", ""); str = str.replace("'", "\'"); str = str.replace("\"", "\\\""); return str; } public static void main(String[] args) throws Exception { DBHandler wikiDB = new DBHandler("es"); wikiDB.createDBConnection(args[0], args[1], args[2], args[3]); int numWords = wikiDB.getNumberOfWords(0); System.out.println("numWords=" + numWords); // wikiDB.printWordList("./tmp.txt", "frequency", 0, 0); /* * ArrayList w = new ArrayList(); HashMap wordList = wikiDB.getMostFrequentWords(0, 0, * w); * * for(int i=0; i w = wikiDB.getMostFrequentWordsArray(0, 0); for (int i = 0; i < w.size(); i++) System.out.println(w.get(i)); wikiDB.closeDBConnection(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy