// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.tools.dbselection;
import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Vector;
import org.apache.commons.lang.StringEscapeUtils;
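/**
* WikipediaMarkupCleaner: extracts the text of Wikipedia pages stored in a MySQL database, removes the wiki
* markup from it and stores the cleaned text, together with a word list including frequencies, in the database.
*
* @author Marcela Charfuelan.
*/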
public class WikipediaMarkupCleaner {
// Locale of the Wikipedia being processed; also used as prefix of the DB table names (e.g. locale + "_wordList").
private String locale = null;
// MySQL connection settings, handed to DBHandler.createDBConnection().
private String mysqlHost = null;
private String mysqlDB = null;
private String mysqlUser = null;
private String mysqlPasswd = null;
// Wikipedia files: the XML dump to load and the optional log file (no log is written when wikiLog is null).
private String xmlWikiFile = null;
private String wikiLog = null;
// When true, the cleaning methods print additional trace output to stdout.
private boolean debug = false;
// When set (non-null), main() processes only this page id in debug mode instead of all pages.
private String debugPageId = null;
// Default settings for max page length and min and max text length
private int minPageLength = 10000; // minimum size of a wikipedia page, to be used in the first filtering of pages
private int minTextLength = 1000; // a cleaned text chunk must be longer than this to be inserted in the DB
private int maxTextLength = 15000; // chunk size used to split long texts; the average length in one big xml file is approx. 12000
// Set to false to save time by not loading the Wiki tables, if they already exist in the DB.
private boolean loadWikiTables = true;
// Set to false to add to an already existing cleanText table instead of creating a new one.
private boolean deleteCleanTextTable = true;
/** Sets the locale (Wikipedia language) to process. */
public void setLocale(String str) {
    this.locale = str;
}

/** Sets the MySQL host name. */
public void setMysqlHost(String str) {
    this.mysqlHost = str;
}

/** Sets the MySQL database name. */
public void setMysqlDB(String str) {
    this.mysqlDB = str;
}

/** Sets the MySQL user name. */
public void setMysqlUser(String str) {
    this.mysqlUser = str;
}

/** Sets the MySQL password. */
public void setMysqlPasswd(String str) {
    this.mysqlPasswd = str;
}

/** Sets the path of the Wikipedia XML file. */
public void setXmlWikiFile(String str) {
    this.xmlWikiFile = str;
}

/** Sets the path of the log file. */
public void setWikiLog(String str) {
    this.wikiLog = str;
}

/** Sets the id of the single page to process in debug mode. */
public void setTestId(String str) {
    this.debugPageId = str;
}

/** Sets the minimum length of a Wikipedia page to be considered. */
public void setMinPageLength(int val) {
    this.minPageLength = val;
}

/** Sets the minimum length of a cleaned text to be kept. */
public void setMinTextLength(int val) {
    this.minTextLength = val;
}

/** Sets the maximum length of a cleaned text chunk. */
public void setMaxTextLength(int val) {
    this.maxTextLength = val;
}

/** Switches the additional debug output on or off. */
public void setDebug(boolean bval) {
    this.debug = bval;
}

/** Sets whether the Wikipedia tables have to be loaded into the DB. */
public void setLoadWikiTables(boolean bval) {
    this.loadWikiTables = bval;
}

/** Sets whether a new cleanText table is created (true) or an existing one is extended (false). */
public void setDeleteCleanTextTable(boolean bval) {
    this.deleteCleanTextTable = bval;
}
/** @return the locale (Wikipedia language) being processed */
public String getLocale() {
    return this.locale;
}

/** @return the MySQL host name */
public String getMysqlHost() {
    return this.mysqlHost;
}

/** @return the MySQL database name */
public String getMysqlDB() {
    return this.mysqlDB;
}

/** @return the MySQL user name */
public String getMysqlUser() {
    return this.mysqlUser;
}

/** @return the MySQL password */
public String getMysqlPasswd() {
    return this.mysqlPasswd;
}

/** @return the path of the Wikipedia XML file */
public String getXmlWikiFile() {
    return this.xmlWikiFile;
}

/** @return the path of the log file, or null when no log is written */
public String getWikiLog() {
    return this.wikiLog;
}

/** @return the id of the page to process in debug mode, or null */
public String getTestId() {
    return this.debugPageId;
}

/** @return the minimum length of a Wikipedia page to be considered */
public int getMinPageLength() {
    return this.minPageLength;
}

/** @return the minimum length of a cleaned text to be kept */
public int getMinTextLength() {
    return this.minTextLength;
}

/** @return the maximum length of a cleaned text chunk */
public int getMaxTextLength() {
    return this.maxTextLength;
}

/** @return true when the additional debug output is switched on */
public boolean getDebug() {
    return this.debug;
}

/** @return true when the Wikipedia tables have to be loaded into the DB */
public boolean getLoadWikiTables() {
    return this.loadWikiTables;
}

/** @return true when a new cleanText table is created, false when an existing one is extended */
public boolean getDeleteCleanTextTable() {
    return this.deleteCleanTextTable;
}
/**
 * Removes the wiki markup from the text of one page and returns the cleaned text split into chunks.
 * <p>
 * The page is read line by line. Reading stops at the first line that contains one of the closing section
 * headings checked below (References, See also, External links, Notes, Sources, Foreign, Discussion).
 * Every other line is passed repeatedly through the removeSection* helpers until no known markup is left,
 * then quotation marks and some problematic symbols are removed or replaced, section headings get a
 * trailing "." and list prefixes are dropped. The cleaned lines are concatenated, and each time the
 * accumulated text becomes longer than maxTextLength it is added to the result as one chunk.
 * <p>
 * NOTE(review): many string literals in this method look truncated (e.g. {@code line.indexOf("= 0)} and
 * removeSection calls with three arguments although removeSection takes four), presumably because
 * HTML-like tag text was stripped from this copy of the file. The method does not compile as shown; the
 * literals have to be restored from the upstream source.
 *
 * @param page
 *            text of one Wikipedia page, including markup
 * @return the cleaned text chunks (String elements); empty when no text is left
 */
public Vector removeMarkup(String page) {
// text accumulated for the chunk that is currently being built
StringBuffer str = new StringBuffer("");
StringBuffer line = null;
Vector textList = new Vector();
boolean endOfText = false;
Scanner s = null;
try {
s = new Scanner(page);
while (s.hasNext() && !endOfText) {
line = new StringBuffer(s.nextLine());
// process text until it finds any of these labels:
if (line.indexOf("==References") >= 0 || line.indexOf("== References") >= 0 || line.indexOf("==See also") >= 0
|| line.indexOf("== See also") >= 0 || line.indexOf("==External links and sources") >= 0
|| line.indexOf("==External links") >= 0 || line.indexOf("== External links") >= 0
|| line.indexOf("== External Links") >= 0 || line.indexOf("== External links and sources") >= 0
|| line.indexOf("==Notes") >= 0 || line.indexOf("== Notes") >= 0 || line.indexOf("==Sources") >= 0
|| line.indexOf("== Sources") >= 0 || line.indexOf("==Foreign") >= 0 || line.indexOf("== Foreign") >= 0
|| line.indexOf("==Discussion") >= 0) {
endOfText = true;
} else {
// when removing sections it might add more lines that might contain again more labels to remove
boolean clean = false;
while (!clean && line.length() > 0) {
clean = true;
if (line.indexOf("= 0) {
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("= 0) {
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("= 0) {
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("
= 0) { // tables
line = removeSection(s, line, "
");
clean = false;
}
if (line.indexOf("
= 0) {
line = removeSection(s, line, "
");
clean = false;
}
if (line.indexOf("{{col-begin}}") >= 0) {
line = removeSection(s, line, "{{col-begin}}", "{{col-end}}");
clean = false;
}
if (line.indexOf("{|") >= 0) { // this is a table, this should go before {{ because a table can contain {{
// }}
line = removeSectionTable(s, line, "{|", "|}");
clean = false;
}
if (line.indexOf("= 0) { // references
line = removeSectionRef(s, line); // This is special because it can be , or />
clean = false;
}
if (line.indexOf("= 0) {
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("= 0) {
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("= 0) {
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("{{start box}}") >= 0) {
line = removeSection(s, line, "{{start box}}", "{{end box}}");
clean = false;
}
if (line.indexOf("{{") >= 0) {
line = removeSection(s, line, "{{", "}}");
clean = false;
}
if (line.indexOf("");
clean = false;
}
if (line.indexOf("\\mathrel{|") >= 0) {
line = removeSection(s, line, "\\mathrel{|", "}");
clean = false;
}
if (line.indexOf("= 0) { // gallery might contain several images
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("[[Image:") >= 0) {
line = removeSectionImage(s, line, "[[Image:", "]]");
clean = false;
}
if (line.indexOf("
= 0) { // span and div tags are used to separate images from text
line = removeSection(s, line, "
");
clean = false;
}
if (line.indexOf("
= 0) {
line = removeSectionImage(s, line, "
");
clean = false;
}
if (line.indexOf("= 0) {
line = removeSection(s, line, "");
clean = false;
}
if (line.indexOf("
", ""));
line = new StringBuffer(line.toString().replaceAll("", ""));
line = new StringBuffer(line.toString().replaceAll("/>", ""));
// Removing quotation marks
line = new StringBuffer(line.toString().replaceAll("\"", ""));
// these quotations have a strange/problematic symbol different from "
line = new StringBuffer(line.toString().replaceAll("“", ""));
line = new StringBuffer(line.toString().replaceAll("”", ""));
// these symbol are also problematic, here they are changed.
line = new StringBuffer(line.toString().replaceAll("’", "'"));
line = new StringBuffer(line.toString().replaceAll("—", "-"));
line = new StringBuffer(line.toString().replaceAll("–", "-"));
line = new StringBuffer(line.toString().replaceAll(" ", " "));
line = new StringBuffer(line.toString().replaceAll("…", " "));
// finally sections and lists
boolean is_title = false;
if (line.toString().startsWith("==")) {
is_title = true;
}
line = new StringBuffer(line.toString().replaceAll("\\s*==+$|==+", ""));
if (is_title) {
line.append(".");
}
// bulleted list and numbered list
if (line.toString().startsWith("***") || line.toString().startsWith("*#*"))
line.replace(0, 3, "");
if (line.toString().startsWith("**") || line.toString().startsWith(":*")
|| line.toString().startsWith("*#") || line.toString().startsWith("##")
|| line.toString().startsWith("::"))
line.replace(0, 2, "");
if (line.toString().startsWith("*") || line.toString().startsWith("#"))
line.replace(0, 1, "");
if (line.toString().startsWith(";") || line.toString().startsWith(";")) // in glossaries definitions start
// with ;
line.replace(0, 1, "");
// remove this when the text is almost clean
if (line.indexOf("= 0)
line = removeSection(s, line, "");
line = new StringBuffer(line.toString().replaceAll("", ""));
if (line.indexOf("
= 0)
line = removeSection(s, line, "
");
if (line.indexOf("= 0)
line = removeSection(s, line, "");
if (line.indexOf("= 0)
line = removeSection(s, line, "");
// finally concatenate the line
str.append(line);
if (!str.toString().endsWith("\n"))
str.append("\n");
line = null;
// check length of the text
if (str.length() > maxTextLength) {
textList.add(str.toString());
// System.out.println("\n-----------\n" + str.toString());
str = new StringBuffer("");
}
}
} // endOfText=false
} // while has more lines
} finally {
if (s != null)
s.close();
}
if (!str.toString().contentEquals(""))
textList.add(str.toString());
return textList;
}
/**
 * Removes reference sections from a line, reading further lines from the scanner while a reference is
 * still open at the end of the current text.
 * <p>
 * This case is special because a reference can be closed in two ways: by a closing tag or by a
 * self-closing "/>" (the latter is the only end marker still visible in this copy).
 * <p>
 * NOTE(review): the start/end tag literals in this method look truncated (e.g.
 * {@code line.indexOf("= 0)}), presumably because HTML-like tag text was stripped from this copy of the
 * file. The method does not compile as shown; the literals have to be restored from the upstream source.
 *
 * @param s
 *            scanner over the page text; additional lines are consumed from it while a reference is open
 * @param lineIn
 *            the line to clean (it is copied, not modified)
 * @return the line without the reference sections, or an empty buffer when a reference could not be closed
 */
private StringBuffer removeSectionRef(Scanner s, StringBuffer lineIn) {
String next;
int index1 = 0, index2 = -1, index3 = -1, endTagLength = 0, numRef = 0;
boolean closeRef = true;
StringBuffer line = new StringBuffer(lineIn);
StringBuffer nextLine;
while ((index1 = line.indexOf("= 0) { // in one line can be more than one reference
numRef++;
if ((index2 = line.indexOf("", index1)) >= 0)
endTagLength = 6 + index2;
else if ((index3 = line.indexOf("/>", index1)) >= 0)
endTagLength = 2 + index3;
if (index2 == -1 && index3 == -1) {// the most be in the next lines, so get more lines until the is
// found
while (s.hasNext() && numRef != 0) {
nextLine = new StringBuffer(s.nextLine());
if (nextLine.indexOf("= 0)
numRef++;
line.append(nextLine);
if ((index2 = line.indexOf("", index1)) >= 0) {
numRef--;
endTagLength = 6 + index2;
} else if ((index3 = line.indexOf("/>", index1)) >= 0) {
numRef--;
endTagLength = 2 + index3;
}
}
} else
// the endTag was found
numRef--;
if (numRef == 0) {
index1 = line.indexOf(" index1) {
line.delete(index1, endTagLength);
// System.out.println("nextline="+line);
} else {
if (debug) {
System.out.print("iniTag: length of line: " + line);
// line.delete(index1, line.length());
}
line = new StringBuffer("");
}
} else {
if (debug)
System.out.println("removeSectionRef: WARNING no or /> in " + line);
// line.delete(index1, line.length());
line = new StringBuffer("");
}
} // while this line contains iniTag-s
return line;
}
/**
 * Deletes every section that starts with iniTag and ends with endTag from a line. When a section is not
 * closed on the current text, further lines are read from the scanner and appended until the number of
 * opened and closed tags is balanced or the scanner has no more input.
 *
 * @param s
 *            scanner over the page text; additional lines are consumed from it while a section is open
 * @param lineIn
 *            the line to clean (it is copied, not modified)
 * @param iniTag
 *            text that opens a section
 * @param endTag
 *            text that closes a section
 * @return the line without the sections, or an empty buffer when a section could not be delimited
 */
private StringBuffer removeSection(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
    StringBuffer buffer = new StringBuffer(lineIn);
    int cutEnd = 0; // exclusive end offset of the text to delete
    int openTags = 0; // sections opened and not yet closed
    if (debug) {
        System.out.println("Removing tag: " + iniTag + " LINE (BEFORE): " + buffer);
    }
    // one line can contain more than one iniTag
    for (int tagStart = buffer.indexOf(iniTag); tagStart >= 0; tagStart = buffer.indexOf(iniTag)) {
        openTags++;
        int closePos = buffer.indexOf(endTag, tagStart);
        if (closePos >= 0) {
            // the section is closed within the text we already have
            cutEnd = closePos + endTag.length();
            openTags--;
        } else {
            // the endTag must be in the following lines: read on until the tags are balanced
            int searchFrom = 0;
            while (s.hasNext() && openTags != 0) {
                String extra = s.nextLine();
                for (int p = extra.indexOf(iniTag); p >= 0; p = extra.indexOf(iniTag, p + iniTag.length())) {
                    openTags++;
                }
                buffer.append(extra);
                // only look at end tags located after the last one already counted
                for (int q = buffer.indexOf(endTag, searchFrom); q >= 0; q = buffer.indexOf(endTag, searchFrom)) {
                    openTags--;
                    searchFrom = q + endTag.length();
                    cutEnd = searchFrom;
                }
            }
        }
        if (openTags != 0) {
            if (debug) {
                System.out.println("removeSection: WARNING no " + endTag);
            }
            buffer = new StringBuffer("");
        } else if (cutEnd > tagStart) {
            if (debug) {
                System.out.println(" FINAL LINE: " + buffer);
                System.out.print("iniTag: " + iniTag + " index1=" + tagStart);
                System.out.print(" endTagLength=" + cutEnd);
                System.out.println(" line.length=" + buffer.length() + " line: " + buffer);
                System.out.println(" line.length=" + buffer.length());
            }
            buffer.delete(tagStart, cutEnd);
        } else {
            if (debug) {
                System.out.println("removeSection: WARNING endTagLength > length of line: ");
                System.out.print("iniTag: " + iniTag + " index1=" + tagStart);
                System.out.print(" endTagLength=" + cutEnd);
                System.out.println(" line.length=" + buffer.length() + " line: " + buffer);
                System.out.println("removeSection: WARNING endTagLength > length of line: " + buffer);
            }
            buffer = new StringBuffer("");
        }
    }
    if (debug) {
        System.out.println(" LINE (AFTER): " + buffer);
    }
    return buffer;
}
/**
 * Deletes table sections from a line. It works like removeSection, except that on the additional lines
 * read from the scanner an endTag only counts when the line starts with it.
 *
 * @param s
 *            scanner over the page text; additional lines are consumed from it while a table is open
 * @param lineIn
 *            the line to clean (it is copied, not modified)
 * @param iniTag
 *            text that opens a table
 * @param endTag
 *            text that closes a table
 * @return the line without the tables, or an empty buffer when a table could not be delimited
 */
private StringBuffer removeSectionTable(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
    StringBuffer buffer = new StringBuffer(lineIn);
    int cutEnd = 0; // exclusive end offset of the text to delete
    int openTags = 0; // tables opened and not yet closed
    if (debug) {
        System.out.println("Removing tag: " + iniTag + " LINE (BEFORE): " + buffer);
    }
    // one line can contain more than one iniTag
    for (int tagStart = buffer.indexOf(iniTag); tagStart >= 0; tagStart = buffer.indexOf(iniTag)) {
        openTags++;
        int closePos = buffer.indexOf(endTag, tagStart);
        if (closePos >= 0) {
            // the table is closed within the text we already have
            cutEnd = closePos + endTag.length();
            openTags--;
        } else {
            // the endTag must be in the following lines: read on until the tags are balanced
            while (s.hasNext() && openTags != 0) {
                String extra = s.nextLine();
                for (int p = extra.indexOf(iniTag); p >= 0; p = extra.indexOf(iniTag, p + iniTag.length())) {
                    openTags++;
                }
                if (extra.startsWith(endTag)) {
                    openTags--;
                    // the end tag will sit right after the text accumulated so far
                    cutEnd = buffer.length() + endTag.length();
                }
                buffer.append(extra);
            }
        }
        if (openTags != 0) {
            if (debug) {
                System.out.println("removeSection: WARNING no " + endTag);
            }
            buffer = new StringBuffer("");
        } else if (cutEnd > tagStart) {
            if (debug) {
                System.out.println(" FINAL LINE: " + buffer);
                System.out.print("iniTag: " + iniTag + " index1=" + tagStart);
                System.out.print(" endTagLength=" + cutEnd);
                System.out.println(" line.length=" + buffer.length() + " line: " + buffer);
                System.out.println(" line.length=" + buffer.length());
            }
            buffer.delete(tagStart, cutEnd);
        } else {
            if (debug) {
                System.out.println("removeSection: WARNING endTagLength > length of line: ");
                System.out.print("iniTag: " + iniTag + " index1=" + tagStart);
                System.out.print(" endTagLength=" + cutEnd);
                System.out.println(" line.length=" + buffer.length() + " line: " + buffer);
                System.out.println("removeSection: WARNING endTagLength > length of line: " + buffer);
            }
            buffer = new StringBuffer("");
        }
    }
    if (debug) {
        System.out.println(" LINE (AFTER): " + buffer);
    }
    return buffer;
}
/****
* This is also special because the line might contain sections with [[ ... ]] so the ]] after a [[ is not the endTag of
* [[image: ... ]]
*
* @param s
* s
* @param lineIn
* lineIn
* @param iniTag
* iniTag
* @param endTag
* endTag
* @return line
*/
/**
 * Deletes image sections (iniTag ... "]]") from a line. Links of the form [[ ... ]] nested inside the image
 * section are skipped, so that their "]]" is not taken as the end of the image section. While the section
 * is still open, further lines are read from the scanner and appended.
 * <p>
 * The closing "]]" is searched in the text at hand before asking the scanner for more input, so an image
 * section on the last line of a page (or one that closes on the last appended line) is removed correctly
 * instead of the whole line being discarded.
 * <p>
 * Note: as in the original implementation, the nested-link detection always looks for "[[" and "]]";
 * endTag is only used for its length.
 *
 * @param s
 *            scanner over the page text; additional lines are consumed from it while the section is open
 * @param lineIn
 *            the line to clean (it is copied, not modified)
 * @param iniTag
 *            text that opens the image section
 * @param endTag
 *            text that closes the image section
 * @return the line without the image sections, or an empty buffer when a section could not be closed
 */
private StringBuffer removeSectionImage(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
    StringBuffer line = new StringBuffer(lineIn);
    int index1;
    if (debug)
        System.out.println("Removing tag: " + iniTag + " LINE (BEFORE): " + line);
    while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag
        boolean closed = false;
        int endTagLength = index1; // exclusive end offset of the section, once it is closed
        int searchFrom = index1; // where to look for the next "]]"
        int segmentStart = index1; // start of the text checked for a nested "[["
        while (!closed) {
            int index2;
            while (!closed && (index2 = line.indexOf("]]", searchFrom)) >= 0) {
                String aux = line.substring(segmentStart + 2, index2 + 2);
                if (debug)
                    System.out.println(" aux=" + aux);
                if (aux.indexOf("[[") == -1) {
                    endTagLength = endTag.length() + index2;
                    closed = true;
                } else { // this "]]" closes a [[ ]] nested inside the image section: skip it
                    segmentStart = index2;
                    searchFrom = index2 + 2;
                }
            }
            if (!closed) {
                // the end tag is not in the text read so far: get another line, or give up at end of input
                if (!s.hasNext())
                    break;
                line.append(s.nextLine());
            }
        }
        if (closed) {
            index1 = line.indexOf(iniTag);
            if (endTagLength > index1) {
                if (debug) {
                    System.out.println(" FINAL LINE: " + line);
                    System.out.print("iniTag: " + iniTag + " index1=" + index1);
                    System.out.print(" endTagLength=" + endTagLength);
                    System.out.println(" line.length=" + line.length() + " line: " + line);
                    System.out.println(" line.length=" + line.length());
                }
                line.delete(index1, endTagLength);
            } else {
                if (debug) {
                    System.out.println("removeSection: WARNING endTagLength > length of line: ");
                    System.out.print("iniTag: " + iniTag + " index1=" + index1);
                    System.out.print(" endTagLength=" + endTagLength);
                    System.out.println(" line.length=" + line.length() + " line: " + line);
                    System.out.println("removeSection: WARNING endTagLength > length of line: " + line);
                }
                line = new StringBuffer("");
            }
        } else {
            if (debug)
                System.out.println("removeSection: WARNING no " + endTag);
            line = new StringBuffer("");
        }
    } // while this line contains iniTag-s
    if (debug)
        System.out.println(" LINE (AFTER): " + line);
    return line;
}
/**
 * Removes the link markup from a line, keeping the text to display.
 * <p>
 * Internal links: [[Name of page]] becomes "Name of page" and [[Name of page|Text to display]] becomes
 * "Text to display". External links that start with [http: or [https: are reduced to their text to display:
 * [http://www.example.org Text to display] becomes "Text to display", and [http://www.example.org] is
 * deleted completely. Bare URLs without brackets are not touched.
 * <p>
 * Closing brackets are always searched starting at the opening bracket, so a stray "]]" or "]" located
 * before a link is never mistaken for the end of that link.
 *
 * @param line
 *            the line to process; it is modified in place
 * @return the same buffer, without link markup
 */
private StringBuffer processInternalAndExternalLinks(StringBuffer line) {
    int index1, index2, index3;
    StringBuffer linetmp = null; // for debugging
    boolean changed = false;
    if (debug)
        linetmp = new StringBuffer(line);
    // Internal links:
    while ((index1 = line.indexOf("[[")) >= 0) {
        changed = true;
        if ((index2 = line.indexOf("]]", index1)) >= 0) {
            if ((index3 = line.indexOf("|", index1)) >= 0 && index3 < index2) { // if there is text to display
                line.delete(index1, index3 + 1); // delete the [[ and the link target up to the |
            } else {
                line.delete(index1, index1 + 2); // delete the [[
            }
            // some text was deleted, so the matching ]] has moved: find it again from the link start
            index2 = line.indexOf("]]", index1);
            line.delete(index2, index2 + 2);
        } else {
            if (debug) {
                System.out.println("processInternalAndExternalLinks: WARNING no ]] tag in " + line);
                System.out.println("deleting [[");
            }
            line.delete(index1, index1 + 2); // delete the [[
        }
    }
    // External links: just the ones started with [http: or [https:
    while ((index1 = line.indexOf("[http:")) >= 0 || (index1 = line.indexOf("[https:")) >= 0) {
        if ((index2 = line.indexOf("]", index1)) >= 0) {
            if ((index3 = line.indexOf(" ", index1)) >= 0 && index3 < index2) { // if there is text to display
                line.delete(index1, index3 + 1); // delete from [http: until the first blank space before ]
                index2 = line.indexOf("]", index1); // the matching ] has moved: find it again from the link start
                line.delete(index2, index2 + 1);
            } else {
                line.delete(index1, index2 + 1); // no text to display, delete the whole ref
            }
        } else {
            if (debug) {
                System.out.println("processInternalAndExternalLinks: WARNING no ] tag when processing lines with http: line="
                        + line);
                System.out.println("deleting [");
            }
            line.delete(index1, index1 + 1); // delete the [
        }
    }
    if (debug && changed) {
        System.out.println("Removing links, LINE(BEFORE): " + linetmp);
        System.out.println(" LINE (AFTER): " + line);
    }
    return line;
}
/**
 * Counts the words of a text in a word list.
 * <p>
 * The text is split at line breaks and blanks; every resulting token is further split into maximal runs
 * of characters accepted by StringUtils.isLetterOrModifier. Every run of at least two characters is a
 * word, whose frequency in the word list is incremented (or set to one when the word is new).
 * <p>
 * The token is walked code point by code point, so letters outside the Basic Multilingual Plane
 * (surrogate pairs) do not split a word, and the minimum length is measured in code points.
 *
 * @param text
 *            cleaned text, lines separated by "\n"
 * @param wordList
 *            map from word (String) to frequency (Integer); it is updated in place
 */
public void addWordToHashMap(String text, HashMap wordList) {
    final int minimumLength = 2;
    for (String sentence : text.split("\n")) {
        for (String w : sentence.split(" ")) {
            // start of the current run of letters, or -1 when the previous character was not a letter
            int start = 0;
            int end = 0;
            while (end < w.length()) {
                int codePoint = w.codePointAt(end);
                if (marytts.util.string.StringUtils.isLetterOrModifier(codePoint)) {
                    if (start < 0)
                        start = end;
                } else {
                    // not a letter: the current run (if any) ends here
                    if (start >= 0 && w.codePointCount(start, end) >= minimumLength)
                        countWord(wordList, w.substring(start, end));
                    start = -1;
                }
                end += Character.charCount(codePoint);
            }
            // a run of letters that reaches the end of the token
            if (start >= 0 && w.codePointCount(start, end) >= minimumLength)
                countWord(wordList, w.substring(start, end));
        }
    }
}

/** Increments the frequency of a word in the word list; a word not yet in the list gets frequency one. */
@SuppressWarnings("unchecked") // the word list is a raw HashMap holding String keys and Integer values
private static void countWord(HashMap wordList, String word) {
    Integer count = (Integer) wordList.get(word);
    wordList.put(word, Integer.valueOf(count == null ? 1 : count.intValue() + 1));
}
/**
 * Saves a word list in the DB. When the table locale_wordList already exists, the frequencies of the new
 * list are added to the frequencies stored in the DB and the combined list is saved; otherwise the new
 * list is saved as it is.
 *
 * @param wikiToDB
 *            handler of an open DB connection
 * @param wlNew
 *            map from word (String) to frequency (Integer) to save
 */
@SuppressWarnings("unchecked") // the word lists are raw HashMaps holding String keys and Integer values
public void updateWordList(DBHandler wikiToDB, HashMap wlNew) {
    // Checking if word list exist
    if (!wikiToDB.tableExist(locale + "_wordList")) {
        System.out.println("Saving " + locale + "_wordList table....");
        wikiToDB.insertWordList(wlNew);
        return;
    }
    System.out.println("Updating " + locale + "_wordList in DB table....");
    HashMap wlOld = wikiToDB.getMostFrequentWords(0, 0);
    // combine the two tables
    for (Object key : wlNew.keySet()) {
        String w = key.toString();
        // the maps are raw, so the values have to be cast explicitly
        Integer freq = (Integer) wlNew.get(w);
        Integer old = (Integer) wlOld.get(w);
        // if the word is not in the old list give it the new frequency, otherwise add both frequencies
        wlOld.put(w, old == null ? freq : Integer.valueOf(old.intValue() + freq.intValue()));
    }
    wikiToDB.insertWordList(wlOld);
    System.out.println("Final size of wordList after combining old and new lists: wordList=[" + wlOld.size() + "]");
}
/**
 * Debug mode: gets the text of the single page debugPageId from the DB, prints it, removes its markup and
 * prints the cleaned text chunks. Nothing is written back to the DB.
 * <p>
 * The page text is only dereferenced after the null check, and the log writer and the DB connection are
 * released even when an exception is thrown.
 *
 * @throws Exception
 *             if the DB access or the creation of the log file fails
 */
void processWikipediaSQLTablesDebug() throws Exception {
    DBHandler wikiToDB = new DBHandler(locale);
    wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);
    PrintWriter pw = null;
    try {
        if (wikiLog != null)
            pw = new PrintWriter(new FileWriter(new File(wikiLog)));
        StringBuilder textId = new StringBuilder();
        // get text from the DB
        String text = wikiToDB.getTextFromWikiPage(debugPageId, minPageLength, textId, pw);
        if (text != null) {
            System.out.println("\nPAGE SIZE=" + text.length() + " text:\n" + text);
            Vector textList = removeMarkup(text);
            System.out.println("\nCLEANED TEXT:");
            for (int i = 0; i < textList.size(); i++)
                System.out.println("text(" + i + "): \n" + textList.get(i));
        } else
            System.out.println("NO CLEANED TEXT.");
    } finally {
        if (pw != null)
            pw.close();
        wikiToDB.closeDBConnection();
    }
}
/**
 * Using mwdumper extracts pages from a xmlWikiFile and load them in a mysql DB (it loads the tables "locale_text",
 * "locale_page" and "locale_revision", where locale is the corresponding wikipedia language). Once the tables are loaded,
 * extract/clean text from the pages and create a cleanText table. It also creates a wordList table including frequencies.
 * <p>
 * When everything succeeds the Wikipedia text, page and revision tables are deleted at the end. The log
 * writer and the DB connection are released even when an exception is thrown; in that case the Wikipedia
 * tables are left untouched.
 *
 * @throws Exception
 *             if the Wikipedia tables are neither loaded by this run nor already present in the DB, or if
 *             the DB access or the creation of the log file fails
 */
void processWikipediaPages() throws Exception {
    // Load wikipedia pages, extract clean text and create word list.
    DateFormat fullDate = new SimpleDateFormat("dd_MM_yyyy_HH:mm:ss");
    String dateStringIni = fullDate.format(new Date());
    DBHandler wikiToDB = new DBHandler(locale);
    System.out.println("Creating connection to DB server...");
    wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);
    PrintWriter pw = null;
    try {
        // This loading can take a while
        // create and load TABLES: page, text and revision
        if (loadWikiTables) {
            System.out.println("Creating and loading TABLES: page, text and revision. (The loading can take a while...)");
            wikiToDB.loadPagesWithMWDumper(xmlWikiFile, locale, mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);
        } else {
            // Checking if tables are already created and loaded in the DB
            if (wikiToDB.checkWikipediaTables())
                System.out.println("TABLES " + locale + "_page, " + locale + "_text and " + locale
                        + "_revision already loaded (WARNING USING EXISTING WIKIPEDIA TABLES).");
            else
                throw new Exception("WikipediaMarkupCleaner: ERROR IN TABLES " + locale + "_page, " + locale + "_text and "
                        + locale + "_revision, they are not CREATED/LOADED.");
        }
        System.out.println("\nGetting page IDs");
        String[] pageId = wikiToDB.getIds("page_id", locale + "_page");
        System.out.println("Number of page IDs to process: " + pageId.length + "\n");
        // create cleanText TABLE
        if (deleteCleanTextTable) {
            System.out.println("Creating (deleting if already exist) " + locale + "_cleanText TABLE");
            wikiToDB.createWikipediaCleanTextTable();
        } else {
            if (wikiToDB.tableExist(locale + "_cleanText"))
                System.out.println(locale + "_cleanText TABLE already exist (ADDING TO EXISTING cleanText TABLE)");
            else {
                System.out.println("Creating " + locale + "_cleanText TABLE");
                wikiToDB.createWikipediaCleanTextTable();
            }
        }
        // hashMap for the dictionary, HashMap is faster than TreeMap so the list of words will
        // be kept in a hashMap. When the process finishes the hashMap will be dumped in the database.
        System.out.println("Starting Hashtable for wordList.");
        int initialCapacity = 200000;
        HashMap wordList = new HashMap(initialCapacity);
        if (wikiLog != null)
            pw = new PrintWriter(new FileWriter(new File(wikiLog)));
        StringBuilder textId = new StringBuilder();
        int numPagesUsed = 0;
        System.out.println("\nStart processing Wikipedia pages.... Start time:" + dateStringIni + "\n");
        for (int i = 0; i < pageId.length; i++) {
            // first filter
            String pageText = wikiToDB.getTextFromWikiPage(pageId[i], minPageLength, textId, pw);
            if (pageText == null)
                continue;
            Vector textList = removeMarkup(pageText);
            numPagesUsed++;
            for (int j = 0; j < textList.size(); j++) {
                // the list is a raw Vector, so the element has to be cast explicitly
                String text = (String) textList.get(j);
                if (text.length() > minTextLength) {
                    // the chunk is long enough after cleaning: keep it
                    wikiToDB.insertCleanText(text, pageId[i], textId.toString());
                    // insert the words in text in wordlist
                    addWordToHashMap(text, wordList);
                    if (debug)
                        System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + " textList (" + (j + 1) + "/"
                                + textList.size() + ") length=" + text.length() + " numPagesUsed=" + numPagesUsed
                                + " Wordlist[" + wordList.size() + "] ");
                    if (pw != null)
                        pw.println("CLEANED PAGE page_id[" + i + "]=" + pageId[i] + " textList (" + (j + 1) + "/"
                                + textList.size() + ") length=" + text.length() + " Wordlist[" + wordList.size() + "] "
                                + " NUM_PAGES_USED=" + numPagesUsed + " text:\n\n" + text);
                } else if (pw != null)
                    pw.println("PAGE NOT USED AFTER CLEANING page_id[" + i + "]=" + pageId[i] + " length=" + text.length());
            } // for each text in textList
            System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + " numPagesUsed=" + numPagesUsed + " Wordlist["
                    + wordList.size() + "] ");
        }
        String dateStringEnd = fullDate.format(new Date());
        if (pw != null) {
            pw.println("Number of PAGES USED=" + numPagesUsed + " Wordlist[" + wordList.size() + "] " + " minPageLength="
                    + minPageLength + " minTextLength=" + minTextLength + " Start time:" + dateStringIni + " End time:"
                    + dateStringEnd);
            pw.close();
        }
        // save the wordList in the DB
        updateWordList(wikiToDB, wordList);
        wikiToDB.printWordList("./wordlist-freq.txt", "frequency", 0, 0);
        System.out.println("\nNumber of pages used=" + numPagesUsed + " Wordlist[" + wordList.size() + "] " + " Start time:"
                + dateStringIni + " End time:" + dateStringEnd);
        // Once created the cleantext table delete the wikipedia text, page and revision tables.
        wikiToDB.deleteWikipediaTables();
    } finally {
        // closing an already closed PrintWriter has no effect, so this is safe after the regular close above
        if (pw != null)
            pw.close();
        wikiToDB.closeDBConnection();
    }
}
/**
 * Prints the current parameter values to stdout.
 * <p>
 * The MySQL password is never printed in clear text: a set password is shown as asterisks, an unset one
 * as "null", so that a missing password can still be recognised in the output.
 */
private void printParameters() {
    String maskedPasswd = (getMysqlPasswd() == null) ? null : "********";
    System.out.println("WikipediaMarkupCleaner parameters:" + "\n -mysqlHost " + getMysqlHost() + "\n -mysqlUser "
            + getMysqlUser() + "\n -mysqlPasswd " + maskedPasswd + "\n -mysqlDB " + getMysqlDB() + "\n -xmlFile "
            + getXmlWikiFile() + "\n -minPage " + getMinPageLength() + "\n -minText " + getMinTextLength()
            + "\n -maxText " + getMaxTextLength() + "\n -log " + getWikiLog() + "\n -debugPageId " + getTestId());
    System.out.println(" -debug " + getDebug());
    System.out.println(" -loadWikiTables " + getLoadWikiTables());
    System.out.println(" -deleteCleanTextTable " + getDeleteCleanTextTable() + "\n");
}
//
/**
 * Reads and parses the command line arguments and stores their values in this object.
 * <p>
 * The options -locale, -mysqlHost, -mysqlUser, -mysqlPasswd, -mysqlDB and -xmlFile are required. When an
 * option is unknown, an option is missing its value, a numeric option has a non-numeric value or a
 * required option is missing, a message and the usage text are printed and false is returned.
 *
 * @param args
 *            the command line arguments
 * @return true, if successful, false otherwise
 */
private boolean readArgs(String[] args) {
    String help = "\nUsage: java WikipediaMarkupCleaner -locale language -mysqlHost host -mysqlUser user \n"
            + " -mysqlPasswd passwd -mysqlDB wikiDB -xmlFile xmlWikiFile \n"
            + " default/optional: [-minPage 10000 -minText 1000 -maxText 15000] \n"
            + " optional: [-log wikiLogFile -debugPageId pageId -debug]\n\n"
            + " -minPage is the minimum size of a wikipedia page that will be considered for cleaning.\n"
            + " -minText is the minimum size of a text to be kept in the DB.\n"
            + " -maxText is used to split big articles in small chunks, this is the maximum chunk size. \n"
            + " -log the wikiLogFile will contain the cleaned text and information about the pages used.\n"
            + " -debug will produce more output and it is mainly used to debug a particular Wikipedia page.\n"
            + " -debugPageId is the page_id number in a wikipedia page table (ex. 18702442), when used this option\n"
            + " the tables will not be loaded, so it is asumed that page, text and revision tables are already loaded.\n"
            + " -noLoadWikiTables use this variable to save time NOT loading wiki tables, they must already exist in the the DB.\n"
            + " -noDeleteCleanTextTable use this variable to do NOT create a new cleanText table, but adding to an already existing\n"
            + " cleanText table.\n";
    // options that must be followed by a value
    String[] valuedOptions = { "-locale", "-mysqlHost", "-mysqlUser", "-mysqlPasswd", "-mysqlDB", "-xmlFile", "-minPage",
            "-minText", "-maxText", "-log", "-debugPageId" };
    if (args.length < 12) { // minimum 12 parameters: the six required options plus their values
        System.out.println(help);
        return false;
    }
    for (int i = 0; i < args.length; i++) {
        String option = args[i];
        if (option.contentEquals("-debug")) {
            setDebug(true);
        } else if (option.contentEquals("-noLoadWikiTables")) {
            // save time NOT loading wiki tables, they must already exist in the DB
            setLoadWikiTables(false);
        } else if (option.contentEquals("-noDeleteCleanTextTable")) {
            // do not create a new cleanText table, but add to an already existing one
            setDeleteCleanTextTable(false);
        } else if (!java.util.Arrays.asList(valuedOptions).contains(option)) { // unknown argument
            System.out.println("\nOption not known: " + option);
            System.out.println(help);
            return false;
        } else if (i + 1 >= args.length) { // the option is the last argument, so its value is missing
            System.out.println("\nMissing value for option: " + option);
            System.out.println(help);
            return false;
        } else {
            String value = args[++i];
            try {
                if (option.contentEquals("-locale"))
                    setLocale(value);
                else if (option.contentEquals("-mysqlHost"))
                    setMysqlHost(value);
                else if (option.contentEquals("-mysqlUser"))
                    setMysqlUser(value);
                else if (option.contentEquals("-mysqlPasswd"))
                    setMysqlPasswd(value);
                else if (option.contentEquals("-mysqlDB"))
                    setMysqlDB(value);
                else if (option.contentEquals("-xmlFile"))
                    setXmlWikiFile(value);
                // From here the arguments are optional
                else if (option.contentEquals("-minPage"))
                    setMinPageLength(Integer.parseInt(value));
                else if (option.contentEquals("-minText"))
                    setMinTextLength(Integer.parseInt(value));
                else if (option.contentEquals("-maxText"))
                    setMaxTextLength(Integer.parseInt(value));
                else if (option.contentEquals("-log"))
                    setWikiLog(value);
                else
                    setTestId(value); // -debugPageId, the only valued option left
            } catch (NumberFormatException e) {
                System.out.println("\nOption " + option + " expects an integer value, but got: " + value);
                System.out.println(help);
                return false;
            }
        }
    }
    if (getLocale() == null) {
        System.out.println("\nMissing locale.");
        printParameters();
        System.out.println(help);
        return false;
    }
    if (getMysqlHost() == null || getMysqlUser() == null || getMysqlPasswd() == null || getMysqlDB() == null) {
        System.out.println("\nMissing required mysql parameters (one/several required variables are null).");
        printParameters();
        System.out.println(help);
        return false;
    }
    if (getXmlWikiFile() == null) {
        System.out.println("\nMissing required parameter, the XML wikipedia file\n");
        printParameters();
        System.out.println(help);
        return false;
    }
    return true;
}
/**
 * Command line entry point: parses the arguments, prints the parameters and then either processes the
 * single page given with -debugPageId or all the Wikipedia pages. Returns without processing anything
 * when the arguments are not valid.
 *
 * @param args
 *            the command line arguments
 * @throws Exception
 *             if the processing fails
 */
public static void main(String[] args) throws Exception {
    WikipediaMarkupCleaner cleaner = new WikipediaMarkupCleaner();
    boolean argsOk = cleaner.readArgs(args);
    if (argsOk) {
        cleaner.printParameters();
        boolean singlePageDebug = cleaner.getTestId() != null;
        if (singlePageDebug) {
            cleaner.processWikipediaSQLTablesDebug();
        } else {
            cleaner.processWikipediaPages();
        }
    }
}
}