All Downloads are FREE. Search and download functionalities are using the official Maven repository.

marytts.tools.dbselection.WikipediaMarkupCleaner Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.dbselection;

import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Vector;

import org.apache.commons.lang.StringEscapeUtils;

/**
 * WikipediaMarkupCleaner
 * 
 * @author Marcela Charfuelan.
 */
public class WikipediaMarkupCleaner {

	// Locale (Wikipedia language) being processed; it is also the prefix of the DB table names built in this
	// class, e.g. locale + "_page", locale + "_cleanText", locale + "_wordList".
	private String locale = null;
	// MySQL connection settings, handed to DBHandler.createDBConnection(host, db, user, passwd).
	private String mysqlHost = null;
	private String mysqlDB = null;
	private String mysqlUser = null;
	private String mysqlPasswd = null;
	// Wikipedia files:
	// xmlWikiFile is the XML dump passed to DBHandler.loadPagesWithMWDumper();
	// wikiLog, when not null, is the path of a log file opened with a PrintWriter in debug processing.
	private String xmlWikiFile = null;
	private String wikiLog = null;
	// When true, the markup-removal helpers print before/after traces and warnings to System.out.
	private boolean debug = false;
	// Id of the single page fetched by processWikipediaSQLTablesDebug() (set/read through setTestId/getTestId).
	private String debugPageId = null;
	// Default settings for the minimum page length and the minimum and maximum text length
	private int minPageLength = 10000; // minimum size of a wikipedia page, to be used in the first filtering of pages
	private int minTextLength = 1000; // NOTE(review): not read in the visible part of this class - presumably a lower bound for cleaned text; confirm
	private int maxTextLength = 15000; // removeMarkup() starts a new text chunk once the accumulated text exceeds this length;
										// the average length in one big xml file is approx. 12000

	// When true, the page, text and revision tables are created and loaded from xmlWikiFile (this can take a while).
	// Set it to false to save time when those tables already exist in the DB; their presence is then only checked.
	private boolean loadWikiTables = true;

	// When true, the cleanText table is created anew (the log message says an existing one is deleted).
	// When false, cleaned text is added to an already existing cleanText table; the table is created only if missing.
	private boolean deleteCleanTextTable = true;

	/**
	 * Sets the locale of the Wikipedia to process; it is also used as prefix of the DB table names.
	 *
	 * @param str
	 *            the locale
	 */
	public void setLocale(String str) {
		this.locale = str;
	}

	/**
	 * Sets the MySQL host used when creating the DB connection.
	 *
	 * @param str
	 *            the MySQL host
	 */
	public void setMysqlHost(String str) {
		this.mysqlHost = str;
	}

	/**
	 * Sets the MySQL database name used when creating the DB connection.
	 *
	 * @param str
	 *            the MySQL database name
	 */
	public void setMysqlDB(String str) {
		this.mysqlDB = str;
	}

	/**
	 * Sets the MySQL user used when creating the DB connection.
	 *
	 * @param str
	 *            the MySQL user
	 */
	public void setMysqlUser(String str) {
		this.mysqlUser = str;
	}

	/**
	 * Sets the MySQL password used when creating the DB connection.
	 *
	 * @param str
	 *            the MySQL password
	 */
	public void setMysqlPasswd(String str) {
		this.mysqlPasswd = str;
	}

	/**
	 * Sets the Wikipedia XML file from which the page, text and revision tables are loaded.
	 *
	 * @param str
	 *            the XML file name
	 */
	public void setXmlWikiFile(String str) {
		this.xmlWikiFile = str;
	}

	/**
	 * Sets the name of the log file; {@code null} means that no log file is opened.
	 *
	 * @param str
	 *            the log file name
	 */
	public void setWikiLog(String str) {
		this.wikiLog = str;
	}

	/**
	 * Sets the id of the page that is fetched and cleaned when processing in debug mode.
	 *
	 * @param str
	 *            the page id
	 */
	public void setTestId(String str) {
		this.debugPageId = str;
	}

	/**
	 * Sets the minimum size of a Wikipedia page, used in the first filtering of pages.
	 *
	 * @param val
	 *            the minimum page length
	 */
	public void setMinPageLength(int val) {
		this.minPageLength = val;
	}

	/**
	 * Sets the minimum text length setting.
	 *
	 * @param val
	 *            the minimum text length
	 */
	public void setMinTextLength(int val) {
		this.minTextLength = val;
	}

	/**
	 * Sets the maximum text length; cleaned text longer than this is split into several chunks.
	 *
	 * @param val
	 *            the maximum text length
	 */
	public void setMaxTextLength(int val) {
		this.maxTextLength = val;
	}

	/**
	 * Switches the debug traces printed while removing markup on or off.
	 *
	 * @param bval
	 *            true to print debug traces
	 */
	public void setDebug(boolean bval) {
		this.debug = bval;
	}

	/**
	 * Selects whether the Wikipedia tables are loaded from the XML file (true) or existing tables are reused (false).
	 *
	 * @param bval
	 *            true to create and load the page, text and revision tables
	 */
	public void setLoadWikiTables(boolean bval) {
		this.loadWikiTables = bval;
	}

	/**
	 * Selects whether a new cleanText table is created (true) or text is added to an existing one (false).
	 *
	 * @param bval
	 *            true to create a new cleanText table
	 */
	public void setDeleteCleanTextTable(boolean bval) {
		this.deleteCleanTextTable = bval;
	}

	/**
	 * Returns the locale of the Wikipedia being processed.
	 *
	 * @return the locale, or null if it has not been set
	 */
	public String getLocale() {
		return this.locale;
	}

	/**
	 * Returns the MySQL host.
	 *
	 * @return the MySQL host, or null if it has not been set
	 */
	public String getMysqlHost() {
		return this.mysqlHost;
	}

	/**
	 * Returns the MySQL database name.
	 *
	 * @return the MySQL database name, or null if it has not been set
	 */
	public String getMysqlDB() {
		return this.mysqlDB;
	}

	/**
	 * Returns the MySQL user.
	 *
	 * @return the MySQL user, or null if it has not been set
	 */
	public String getMysqlUser() {
		return this.mysqlUser;
	}

	/**
	 * Returns the MySQL password.
	 *
	 * @return the MySQL password, or null if it has not been set
	 */
	public String getMysqlPasswd() {
		return this.mysqlPasswd;
	}

	/**
	 * Returns the name of the Wikipedia XML file.
	 *
	 * @return the XML file name, or null if it has not been set
	 */
	public String getXmlWikiFile() {
		return this.xmlWikiFile;
	}

	/**
	 * Returns the name of the log file.
	 *
	 * @return the log file name, or null if it has not been set
	 */
	public String getWikiLog() {
		return this.wikiLog;
	}

	/**
	 * Returns the id of the page used when processing in debug mode.
	 *
	 * @return the debug page id, or null if it has not been set
	 */
	public String getTestId() {
		return this.debugPageId;
	}

	/**
	 * Returns the minimum size of a Wikipedia page (10000 unless changed).
	 *
	 * @return the minimum page length
	 */
	public int getMinPageLength() {
		return this.minPageLength;
	}

	/**
	 * Returns the minimum text length setting (1000 unless changed).
	 *
	 * @return the minimum text length
	 */
	public int getMinTextLength() {
		return this.minTextLength;
	}

	/**
	 * Returns the maximum text length setting (15000 unless changed).
	 *
	 * @return the maximum text length
	 */
	public int getMaxTextLength() {
		return this.maxTextLength;
	}

	/**
	 * Tells whether debug traces are printed.
	 *
	 * @return true if debug traces are enabled
	 */
	public boolean getDebug() {
		return this.debug;
	}

	/**
	 * Tells whether the Wikipedia tables are created and loaded from the XML file.
	 *
	 * @return true if the tables are loaded, false if existing tables are reused
	 */
	public boolean getLoadWikiTables() {
		return this.loadWikiTables;
	}

	/**
	 * Tells whether a new cleanText table is created instead of adding to an existing one.
	 *
	 * @return true if a new cleanText table is created
	 */
	public boolean getDeleteCleanTextTable() {
		return this.deleteCleanTextTable;
	}

	public Vector removeMarkup(String page) {
		StringBuffer str = new StringBuffer("");
		StringBuffer line = null;
		Vector textList = new Vector();

		boolean endOfText = false;
		Scanner s = null;
		try {
			s = new Scanner(page);
			while (s.hasNext() && !endOfText) {

				line = new StringBuffer(s.nextLine());
				// process text until it finds any of these labels:
				if (line.indexOf("==References") >= 0 || line.indexOf("== References") >= 0 || line.indexOf("==See also") >= 0
						|| line.indexOf("== See also") >= 0 || line.indexOf("==External links and sources") >= 0
						|| line.indexOf("==External links") >= 0 || line.indexOf("== External links") >= 0
						|| line.indexOf("== External Links") >= 0 || line.indexOf("== External links and sources") >= 0
						|| line.indexOf("==Notes") >= 0 || line.indexOf("== Notes") >= 0 || line.indexOf("==Sources") >= 0
						|| line.indexOf("== Sources") >= 0 || line.indexOf("==Foreign") >= 0 || line.indexOf("== Foreign") >= 0
						|| line.indexOf("==Discussion") >= 0) {
					endOfText = true;
				} else {
					// when removing sections it might add more lines that might contain again more labels to remove
					boolean clean = false;
					while (!clean && line.length() > 0) {
						clean = true;
						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) { // tables
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("{{col-begin}}") >= 0) {
							line = removeSection(s, line, "{{col-begin}}", "{{col-end}}");
							clean = false;
						}

						if (line.indexOf("{|") >= 0) { // this is a table, this should go before {{ because a table can contain {{
														// }}
							line = removeSectionTable(s, line, "{|", "|}");
							clean = false;
						}

						if (line.indexOf("= 0) { // references
							line = removeSectionRef(s, line); // This is special because it can be ,  or />
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}
						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("{{start box}}") >= 0) {
							line = removeSection(s, line, "{{start box}}", "{{end box}}");
							clean = false;
						}

						if (line.indexOf("{{") >= 0) {
							line = removeSection(s, line, "{{", "}}");
							clean = false;
						}

						if (line.indexOf("");
							clean = false;
						}

						if (line.indexOf("\\mathrel{|") >= 0) {
							line = removeSection(s, line, "\\mathrel{|", "}");
							clean = false;
						}

						if (line.indexOf("= 0) { // gallery might contain several images
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("[[Image:") >= 0) {
							line = removeSectionImage(s, line, "[[Image:", "]]");
							clean = false;
						}

						if (line.indexOf("= 0) { // span and div tags are used to separate images from text
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSectionImage(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("") >= 0) {
							line = removeSection(s, line, "", "");
							clean = false;
						}

						if (line.indexOf("") >= 0) {
							line = removeSection(s, line, "", "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

						if (line.indexOf("= 0) {
							line = removeSection(s, line, "");
							clean = false;
						}

					} // while the line/text is not clean (or does not have tags to remove)

					// here filter bulleted and numbered short lines
					if (line.length() > 0) {
						if ((line.toString().startsWith("*") || line.toString().startsWith("#")
								|| line.toString().startsWith(";") || line.toString().startsWith(".")
								|| line.toString().startsWith(",") || line.toString().startsWith("&")
								|| line.toString().startsWith("}") || line.toString().startsWith("]")
								|| line.toString().startsWith("|") || line.toString().startsWith("ca:")
								|| line.toString().startsWith("cs:") || line.toString().startsWith("de:")
								|| line.toString().startsWith("es:") || line.toString().startsWith("fr:")
								|| line.toString().startsWith("it:") || line.toString().startsWith("hu:")
								|| line.toString().startsWith("ja:") || line.toString().startsWith("no:")
								|| line.toString().startsWith("pt:") || line.toString().startsWith("sl:")
								|| line.toString().startsWith("fi:") || line.toString().startsWith("sv:")
								|| line.toString().startsWith("tr:") || line.toString().startsWith("zh:")
								|| line.toString().startsWith("Category:") || line.toString().startsWith("!style=")
								|| line.toString().startsWith("!  style=") || line.toString().startsWith("!align=")
								|| line.toString().startsWith(":: 0) {

						line = new StringBuffer(line.toString().replaceAll("'''''", ""));
						line = new StringBuffer(line.toString().replaceAll("'''", ""));
						line = new StringBuffer(line.toString().replaceAll("''", ""));

						line = processInternalAndExternalLinks(line);

						// this will convert HTML   – etc.
						String strlineNoHTML = StringEscapeUtils.unescapeHtml(line.toString());
						line = new StringBuffer(strlineNoHTML);

						// The previous does not remove all HTML stuff, so here it is done some manually
						line = new StringBuffer(line.toString().replaceAll("", ""));
						line = new StringBuffer(line.toString().replaceAll("", ""));
						line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("
", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("
  • ", "")); line = new StringBuffer(line.toString().replaceAll("
  • ", "")); line = new StringBuffer(line.toString().replaceAll("
  • ", "")); line = new StringBuffer(line.toString().replaceAll("
  • ", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("

    ", "")); line = new StringBuffer(line.toString().replaceAll("

    ", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("
    ", "")); line = new StringBuffer(line.toString().replaceAll("", "")); line = new StringBuffer(line.toString().replaceAll("/>", "")); // Removing quotation marks line = new StringBuffer(line.toString().replaceAll("\"", "")); // these quotations have a strange/problematic symbol different from " line = new StringBuffer(line.toString().replaceAll("“", "")); line = new StringBuffer(line.toString().replaceAll("”", "")); // these symbol are also problematic, here they are changed. line = new StringBuffer(line.toString().replaceAll("’", "'")); line = new StringBuffer(line.toString().replaceAll("—", "-")); line = new StringBuffer(line.toString().replaceAll("–", "-")); line = new StringBuffer(line.toString().replaceAll(" ", " ")); line = new StringBuffer(line.toString().replaceAll("…", " ")); // finally sections and lists boolean is_title = false; if (line.toString().startsWith("==")) { is_title = true; } line = new StringBuffer(line.toString().replaceAll("\\s*==+$|==+", "")); if (is_title) { line.append("."); } // bulleted list and numbered list if (line.toString().startsWith("***") || line.toString().startsWith("*#*")) line.replace(0, 3, ""); if (line.toString().startsWith("**") || line.toString().startsWith(":*") || line.toString().startsWith("*#") || line.toString().startsWith("##") || line.toString().startsWith("::")) line.replace(0, 2, ""); if (line.toString().startsWith("*") || line.toString().startsWith("#")) line.replace(0, 1, ""); if (line.toString().startsWith(";") || line.toString().startsWith(";")) // in glossaries definitions start // with ; line.replace(0, 1, ""); // remove this when the text is almost clean if (line.indexOf("= 0) line = removeSection(s, line, ""); line = new StringBuffer(line.toString().replaceAll("", "")); if (line.indexOf("= 0) line = removeSection(s, line, ""); if (line.indexOf("= 0) line = removeSection(s, line, ""); if (line.indexOf("= 0) line = removeSection(s, line, ""); // finally concatenate the line str.append(line); if 
(!str.toString().endsWith("\n")) str.append("\n"); line = null; // check length of the text if (str.length() > maxTextLength) { textList.add(str.toString()); // System.out.println("\n-----------\n" + str.toString()); str = new StringBuffer(""); } } } // endOfText=false } // while has more lines } finally { if (s != null) s.close(); } if (!str.toString().contentEquals("")) textList.add(str.toString()); return textList; } // This is special because it can be: // ... // // private StringBuffer removeSectionRef(Scanner s, StringBuffer lineIn) { String next; int index1 = 0, index2 = -1, index3 = -1, endTagLength = 0, numRef = 0; boolean closeRef = true; StringBuffer line = new StringBuffer(lineIn); StringBuffer nextLine; while ((index1 = line.indexOf("= 0) { // in one line can be more than one reference numRef++; if ((index2 = line.indexOf("", index1)) >= 0) endTagLength = 6 + index2; else if ((index3 = line.indexOf("/>", index1)) >= 0) endTagLength = 2 + index3; if (index2 == -1 && index3 == -1) {// the most be in the next lines, so get more lines until the is // found while (s.hasNext() && numRef != 0) { nextLine = new StringBuffer(s.nextLine()); if (nextLine.indexOf("= 0) numRef++; line.append(nextLine); if ((index2 = line.indexOf("", index1)) >= 0) { numRef--; endTagLength = 6 + index2; } else if ((index3 = line.indexOf("/>", index1)) >= 0) { numRef--; endTagLength = 2 + index3; } } } else // the endTag was found numRef--; if (numRef == 0) { index1 = line.indexOf(" index1) { line.delete(index1, endTagLength); // System.out.println("nextline="+line); } else { if (debug) { System.out.print("iniTag: length of line: " + line); // line.delete(index1, line.length()); } line = new StringBuffer(""); } } else { if (debug) System.out.println("removeSectionRef: WARNING no or /> in " + line); // line.delete(index1, line.length()); line = new StringBuffer(""); } } // while this line contains iniTag-s return line; } private StringBuffer removeSection(Scanner s, StringBuffer 
lineIn, String iniTag, String endTag) { String next; int index1 = 0, index2 = -1, endTagLength = 0, numRef = 0, lastEndTag = 0, lastIniTag = 0; boolean closeRef = true; StringBuffer line = new StringBuffer(lineIn); StringBuffer nextLine; if (debug) System.out.println("Removing tag: " + iniTag + " LINE (BEFORE): " + line); while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag numRef++; if ((index2 = line.indexOf(endTag, index1)) >= 0) endTagLength = endTag.length() + index2; if (index2 == -1) {// the iniTag most be in the next lines, so get more lines until the endTag is found lastEndTag = 0; // start to look for the endTag in 0 while (s.hasNext() && numRef != 0) { lastIniTag = 0; nextLine = new StringBuffer(s.nextLine()); // if(debug) // System.out.println(" NEXTLINE: " + nextLine); while ((index1 = nextLine.indexOf(iniTag, lastIniTag)) >= 0) { numRef++; lastIniTag = iniTag.length() + index1; } line.append(nextLine); // next time it will look for the endTag after the position of the last it found. 
while ((index2 = line.indexOf(endTag, lastEndTag)) >= 0) { numRef--; lastEndTag = index2 + endTag.length(); // I need to remember where the last endTag was found endTagLength = endTag.length() + index2; } // if(debug) // System.out.println("LINE (numRef=" + numRef + "): " + line); } } else // the endTag was found numRef--; if (numRef == 0) { index1 = line.indexOf(iniTag); // get again this because the position might change if (endTagLength > index1) { if (debug) { System.out.println(" FINAL LINE: " + line); System.out.print("iniTag: " + iniTag + " index1=" + index1); System.out.print(" endTagLength=" + endTagLength); System.out.println(" line.length=" + line.length() + " line: " + line); System.out.println(" line.length=" + line.length()); } line.delete(index1, endTagLength); } else { if (debug) { System.out.println("removeSection: WARNING endTagLength > length of line: "); System.out.print("iniTag: " + iniTag + " index1=" + index1); System.out.print(" endTagLength=" + endTagLength); System.out.println(" line.length=" + line.length() + " line: " + line); System.out.println("removeSection: WARNING endTagLength > length of line: " + line); } line = new StringBuffer(""); } // System.out.println("nextline="+line); } else { if (debug) System.out.println("removeSection: WARNING no " + endTag); line = new StringBuffer(""); } } // while this line contains iniTag-s if (debug) System.out.println(" LINE (AFTER): " + line); return line; } private StringBuffer removeSectionTable(Scanner s, StringBuffer lineIn, String iniTag, String endTag) { String next; int index1 = 0, index2 = -1, endTagLength = 0, numRef = 0, lastEndTag = 0, lastIniTag = 0; boolean closeRef = true; StringBuffer line = new StringBuffer(lineIn); StringBuffer nextLine; if (debug) System.out.println("Removing tag: " + iniTag + " LINE (BEFORE): " + line); while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag numRef++; if ((index2 = line.indexOf(endTag, index1)) >= 0) 
endTagLength = endTag.length() + index2; if (index2 == -1) {// the iniTag most be in the next lines, so get more lines until the endTag is found lastEndTag = 0; // start to look for the endTag in 0 while (s.hasNext() && numRef != 0) { lastIniTag = 0; nextLine = new StringBuffer(s.nextLine()); // if(debug) // System.out.println(" NEXTLINE: " + nextLine); while ((index1 = nextLine.indexOf(iniTag, lastIniTag)) >= 0) { numRef++; lastIniTag = iniTag.length() + index1; } // next time it will look for the endTag after the position of the last it found. // while( (index2 = line.indexOf(endTag, lastEndTag)) >= 0 ){ if (nextLine.toString().startsWith(endTag)) { numRef--; // index2 = line.length(); // lastEndTag = index2 + endTag.length(); // I need to remember where the last endTag was found endTagLength = line.length() + endTag.length(); } line.append(nextLine); // if(debug) // System.out.println("LINE (numRef=" + numRef + "): " + line); } } else // the endTag was found numRef--; if (numRef == 0) { index1 = line.indexOf(iniTag); // get again this because the position might change if (endTagLength > index1) { if (debug) { System.out.println(" FINAL LINE: " + line); System.out.print("iniTag: " + iniTag + " index1=" + index1); System.out.print(" endTagLength=" + endTagLength); System.out.println(" line.length=" + line.length() + " line: " + line); System.out.println(" line.length=" + line.length()); } line.delete(index1, endTagLength); } else { if (debug) { System.out.println("removeSection: WARNING endTagLength > length of line: "); System.out.print("iniTag: " + iniTag + " index1=" + index1); System.out.print(" endTagLength=" + endTagLength); System.out.println(" line.length=" + line.length() + " line: " + line); System.out.println("removeSection: WARNING endTagLength > length of line: " + line); } line = new StringBuffer(""); } // System.out.println("nextline="+line); } else { if (debug) System.out.println("removeSection: WARNING no " + endTag); line = new StringBuffer(""); 
} } // while this line contains iniTag-s if (debug) System.out.println(" LINE (AFTER): " + line); return line; } /**** * This is also special because the line might contain sections with [[ ... ]] so the ]] after a [[ is not the endTag of * [[image: ... ]] * * @param s * s * @param lineIn * lineIn * @param iniTag * iniTag * @param endTag * endTag * @return line */ private StringBuffer removeSectionImage(Scanner s, StringBuffer lineIn, String iniTag, String endTag) { String next; int index1 = 0, index2 = -1, index3 = -1, endTagLength = 0, numRef = 0, lastEndTag1 = 0, lastIniTag = 0; boolean closeRef = true; StringBuffer line = new StringBuffer(lineIn); StringBuffer nextLine; StringBuffer aux; if (debug) System.out.println("Removing tag: " + iniTag + " LINE (BEFORE): " + line); while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag numRef++; index3 = endTagLength = index1; while (s.hasNext() && numRef > 0) { while ((index2 = line.indexOf("]]", endTagLength)) >= 0 && numRef > 0) { aux = new StringBuffer(line.subSequence(index1 + 2, index2 + 2)); if (debug) System.out.println(" aux=" + aux); if ((index3 = aux.indexOf("[[")) == -1) { endTagLength = endTag.length() + index2; numRef--; } else { // The previous was a [[ ]] inside of a [[Image: so it has to be deleted index1 = index2; endTagLength = index2 + 2; index2 = -1; } } // so far it has not found the endTag, so get another line if (numRef > 0) line.append(s.nextLine()); } if (numRef == 0) { index1 = line.indexOf(iniTag); // get again this because the position might change if (endTagLength > index1) { if (debug) { System.out.println(" FINAL LINE: " + line); System.out.print("iniTag: " + iniTag + " index1=" + index1); System.out.print(" endTagLength=" + endTagLength); System.out.println(" line.length=" + line.length() + " line: " + line); System.out.println(" line.length=" + line.length()); } line.delete(index1, endTagLength); } else { if (debug) { System.out.println("removeSection: 
WARNING endTagLength > length of line: "); System.out.print("iniTag: " + iniTag + " index1=" + index1); System.out.print(" endTagLength=" + endTagLength); System.out.println(" line.length=" + line.length() + " line: " + line); System.out.println("removeSection: WARNING endTagLength > length of line: " + line); } line = new StringBuffer(""); } } else { if (debug) System.out.println("removeSection: WARNING no " + endTag); line = new StringBuffer(""); } } // while this line contains iniTag-s if (debug) System.out.println(" LINE (AFTER): " + line); return line; } /*** * Internal links: [[Name of page]] [[Name of page|Text to display]] External links: [http://www.example.org Text to display] * [http://www.example.org] http://www.example.org * * @param line * line */ private StringBuffer processInternalAndExternalLinks(StringBuffer line) { int index1, index2, index3; StringBuffer linetmp = null; // for debugging boolean changed = false; if (debug) linetmp = new StringBuffer(line); // Internal links: while ((index1 = line.indexOf("[[")) >= 0) { changed = true; if ((index2 = line.indexOf("]]")) >= 0) { if ((index3 = line.indexOf("|", index1)) >= 0 && index3 < index2) { // if there is text to display line.delete(index1, index3 + 1); // delete the link and [[ ]] index2 = line.indexOf("]]"); // since i delete some text i need to find again the next ]] line.delete(index2, index2 + 2); } else { line.delete(index1, index1 + 2); // delete the [[ index2 = line.indexOf("]]"); // since i delete some text i need to find again the next ]] line.delete(index2, index2 + 2); // delete the ]] -2 because in the previous line i deleted two chars } // if(debug) // System.out.println("LINE (AFTER): " + line); } else { if (debug) { System.out.println("processInternalAndExternalLinks: WARNING no ]] tag in " + line); System.out.println("deleting [["); } line.delete(index1, index1 + 2); // delete the [[ } } // External links: just the ones started with [http: and here I am deleting the whole 
reference // i am not keeping the text to display of this link. while ((index1 = line.indexOf("[http:")) >= 0 || (index1 = line.indexOf("[https:")) >= 0) { // System.out.println("LINE(BEFORE): " + line); if ((index2 = line.indexOf("]", index1)) >= 0) { // line.delete(index1, index2+1); if ((index3 = line.indexOf(" ", index1)) >= 0 && index3 < index2) { // if there is text to display line.delete(index1, index3 + 1); // delete the link and [http: until first black space before ] index2 = line.indexOf("]"); // since i delete some text i need to find again the next ]] line.delete(index2, index2 + 1); } else { line.delete(index1, index2 + 1); // no text to display, delete the whole ref } // System.out.println("LINE (AFTER): " + line + "\n"); } else { if (debug) { System.out.println("processInternalAndExternalLinks: WARNING no ] tag when processing lines with http: line=" + line); System.out.println("deleting ["); } line.delete(index1, index1 + 1); // delete the [ } } if (debug && changed) { System.out.println("Removing links, LINE(BEFORE): " + linetmp); System.out.println(" LINE (AFTER): " + line); } return line; } public void addWordToHashMap(String text, HashMap wordList) { String sentences[]; String words[], w; Integer i; int m, n; sentences = text.split("\n"); for (m = 0; m < sentences.length; m++) { // System.out.println("\n" + sentences[m]); words = sentences[m].split(" "); for (n = 0; n < words.length; n++) { w = words[n]; // System.out.print("word=" + words[n] + " -->"); // Split into letter sections that we will consider atomic "words": int start = 0, end = 0; int minimumLength = 2; for (; end < w.length(); end++) { // if (Character.isLetter(w.charAt(end))) { if (marytts.util.string.StringUtils.isLetterOrModifier(w.codePointAt(end))) { if (start < 0) start = end; continue; } // not a letter if (start >= 0 && end - start >= minimumLength) { String oneWord = w.substring(start, end); // System.out.print(" oneWord1=" + oneWord); Integer count = (Integer) 
wordList.get(oneWord);
							// if key is not in the map then give it value one
							// otherwise increment its value by 1
							if (count == null)
								wordList.put(oneWord, new Integer(1));
							else
								wordList.put(oneWord, new Integer(count.intValue() + 1));
						}
						start = -1;
					}
				if (start >= 0 && end - start >= minimumLength) {
					String oneWord = w.substring(start, end);
					// System.out.print(" oneWord2=" + oneWord);
					Integer count = (Integer) wordList.get(oneWord);
					// if key is not in the map then give it value one
					// otherwise increment its value by 1
					if (count == null)
						wordList.put(oneWord, new Integer(1));
					else
						wordList.put(oneWord, new Integer(count.intValue() + 1));
				}
				/*
				 * // remove punctuation if( w.endsWith(",") || w.endsWith(";") || w.endsWith(".") || w.endsWith(":") ||
				 * w.endsWith("'") || w.endsWith(")") || w.endsWith("?") ) w = w.substring(0, (w.length()-1)); if(
				 * w.endsWith("'s") ) w = w.substring(0, (w.length()-2)); if(w.startsWith("(") ) w = w.substring(1, w.length());
				 * 
				 * if( w.length()>1 && StringUtils.isAlpha(w) && StringUtils.isNotBlank(w) && StringUtils.isNotEmpty(w) &&
				 * StringUtils.isAsciiPrintable(w)) { //System.out.print(w + " "); i = (Integer) wordList.get(w); // if key is not
				 * in the map then give it value one // otherwise increment its value by 1 if(i==null) wordList.put(w, new
				 * Integer(1)); else wordList.put(w, new Integer( i.intValue() + 1)); } // if word is > 1 and isAlpha
				 */
				// System.out.println("\n");
			}
			// System.out.println("\n");
			words = null;
		}
		sentences = null;
	}

	/**
	 * Stores a word/frequency list in the DB. If the table {@code <locale>_wordList} already exists, its current content is
	 * read, the frequencies in {@code wlNew} are added to it and the combined list is written back; otherwise {@code wlNew}
	 * is inserted as it is.
	 * 
	 * @param wikiToDB
	 *            handler with an open DB connection
	 * @param wlNew
	 *            words found in this run, mapped to their frequency
	 */
	public void updateWordList(DBHandler wikiToDB, HashMap<String, Integer> wlNew) {
		// Checking if word list exist
		if (wikiToDB.tableExist(locale + "_wordList")) {
			System.out.println("Updating " + locale + "_wordList in DB table....");
			HashMap<String, Integer> wlOld = wikiToDB.getMostFrequentWords(0, 0);

			// combine the two tables: a word unknown to the old list gets the new frequency,
			// a known word gets the sum of both frequencies
			for (String word : wlNew.keySet()) {
				Integer freq = wlNew.get(word);
				Integer oldFreq = wlOld.get(word);
				if (oldFreq == null)
					wlOld.put(word, freq);
				else
					wlOld.put(word, Integer.valueOf(oldFreq.intValue() + freq.intValue()));
			}
			wikiToDB.insertWordList(wlOld);
			System.out.println("Final size of wordList after combining old and new lists: wordList=[" + wlOld.size() + "]");
		} else {
			System.out.println("Saving " + locale + "_wordList table....");
			wikiToDB.insertWordList(wlNew);
		}
	}

	/**
	 * Debug entry point: fetches the single page identified by {@code debugPageId} from the DB, prints the raw text and the
	 * text chunks returned by {@code removeMarkup}. Nothing is written to the cleanText or wordList tables.
	 * 
	 * @throws Exception
	 *             if the DB access or the log file creation fails
	 */
	void processWikipediaSQLTablesDebug() throws Exception {
		DBHandler wikiToDB = new DBHandler(locale);
		wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);

		PrintWriter pw = null;
		try {
			if (wikiLog != null)
				pw = new PrintWriter(new FileWriter(new File(wikiLog)));

			// get text from the DB
			StringBuilder textId = new StringBuilder();
			String text = wikiToDB.getTextFromWikiPage(debugPageId, minPageLength, textId, pw);

			// the page text may be null (processWikipediaPages checks for it as well),
			// so it must not be dereferenced before this check
			if (text != null) {
				System.out.println("\nPAGE SIZE=" + text.length() + " text:\n" + text);
				Vector<String> textList = removeMarkup(text);
				System.out.println("\nCLEANED TEXT:");
				for (int i = 0; i < textList.size(); i++)
					System.out.println("text(" + i + "): \n" + textList.get(i));
			} else
				System.out.println("NO CLEANED TEXT.");
		} finally {
			// release the log file and the connection even if something above failed
			if (pw != null)
				pw.close();
			wikiToDB.closeDBConnection();
		}
	}

	/***
	 * Using mwdumper extracts pages from a xmlWikiFile and load them in a mysql DB (it loads the tables "locale_text",
	 * "locale_page" and "locale_revision", where locale is the corresponding wikipedia language). Once the tables are loaded,
	 * extract/clean text from the pages and create a cleanText table. It also creates a wordList table including frequencies.
	 * <p>
	 * The wikipedia text, page and revision tables are deleted at the end of a successful run only; the log writer and the DB
	 * connection are released in every case.
	 * 
	 * @throws Exception
	 *             if the wikipedia tables are neither loaded nor already present, or if a DB or log file operation fails
	 */
	void processWikipediaPages() throws Exception {
		// Load wikipedia pages, extract clean text and create word list.
		DateFormat fullDate = new SimpleDateFormat("dd_MM_yyyy_HH:mm:ss");
		String dateStringIni = fullDate.format(new Date());
		String dateStringEnd = "";

		DBHandler wikiToDB = new DBHandler(locale);

		System.out.println("Creating connection to DB server...");
		wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);

		PrintWriter pw = null;
		try {
			// This loading can take a while
			// create and load TABLES: page, text and revision
			if (loadWikiTables) {
				System.out.println("Creating and loading TABLES: page, text and revision. (The loading can take a while...)");
				wikiToDB.loadPagesWithMWDumper(xmlWikiFile, locale, mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);
			} else {
				// Checking if tables are already created and loaded in the DB
				if (wikiToDB.checkWikipediaTables())
					System.out.println("TABLES " + locale + "_page, " + locale + "_text and " + locale
							+ "_revision already loaded (WARNING USING EXISTING WIKIPEDIA TABLES).");
				else
					throw new Exception("WikipediaMarkupCleaner: ERROR IN TABLES " + locale + "_page, " + locale
							+ "_text and " + locale + "_revision, they are not CREATED/LOADED.");
			}

			System.out.println("\nGetting page IDs");
			String[] pageId = wikiToDB.getIds("page_id", locale + "_page");
			System.out.println("Number of page IDs to process: " + pageId.length + "\n");

			// create cleanText TABLE
			if (deleteCleanTextTable) {
				System.out.println("Creating (deleting if already exist) " + locale + "_cleanText TABLE");
				wikiToDB.createWikipediaCleanTextTable();
			} else if (wikiToDB.tableExist(locale + "_cleanText")) {
				System.out.println(locale + "_cleanText TABLE already exist (ADDING TO EXISTING cleanText TABLE)");
			} else {
				System.out.println("Creating " + locale + "_cleanText TABLE");
				wikiToDB.createWikipediaCleanTextTable();
			}

			// hashMap for the dictionary, HashMap is faster than TreeMap so the list of words will
			// be kept in a hashMap. When the process finishes the hashMap is dumped in the database.
			System.out.println("Starting Hashtable for wordList.");
			int initialCapacity = 200000;
			HashMap<String, Integer> wordList = new HashMap<String, Integer>(initialCapacity);

			if (wikiLog != null)
				pw = new PrintWriter(new FileWriter(new File(wikiLog)));

			StringBuilder textId = new StringBuilder();
			int numPagesUsed = 0;

			System.out.println("\nStart processing Wikipedia pages.... Start time:" + dateStringIni + "\n");

			for (int i = 0; i < pageId.length; i++) {
				// first filter: a null text means the page is not used
				String pageText = wikiToDB.getTextFromWikiPage(pageId[i], minPageLength, textId, pw);
				if (pageText == null)
					continue;

				Vector<String> textList = removeMarkup(pageText);
				numPagesUsed++;
				for (int j = 0; j < textList.size(); j++) {
					String text = textList.get(j);

					if (text.length() > minTextLength) {
						// the cleaned chunk is long enough: keep it
						wikiToDB.insertCleanText(text, pageId[i], textId.toString());
						// insert the words in text in wordlist
						addWordToHashMap(text, wordList);

						if (debug)
							System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + " textList (" + (j + 1) + "/"
									+ textList.size() + ") length=" + text.length() + " numPagesUsed=" + numPagesUsed
									+ " Wordlist[" + wordList.size() + "] ");
						if (pw != null)
							pw.println("CLEANED PAGE page_id[" + i + "]=" + pageId[i] + " textList (" + (j + 1) + "/"
									+ textList.size() + ") length=" + text.length() + " Wordlist[" + wordList.size() + "] "
									+ " NUM_PAGES_USED=" + numPagesUsed + " text:\n\n" + text);
					} else if (pw != null)
						pw.println("PAGE NOT USED AFTER CLEANING page_id[" + i + "]=" + pageId[i] + " length="
								+ text.length());
				} // for each text in textList
				System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + " numPagesUsed=" + numPagesUsed
						+ " Wordlist[" + wordList.size() + "] ");
				textList.clear(); // clear the list of text
			}

			dateStringEnd = fullDate.format(new Date());

			if (pw != null) {
				pw.println("Number of PAGES USED=" + numPagesUsed + " Wordlist[" + wordList.size() + "] "
						+ " minPageLength=" + minPageLength + " minTextLength=" + minTextLength + " Start time:"
						+ dateStringIni + " End time:" + dateStringEnd);
				pw.close();
				pw = null;
			}

			// save the wordList in the DB
			updateWordList(wikiToDB, wordList);
			wikiToDB.printWordList("./wordlist-freq.txt", "frequency", 0, 0);

			System.out.println("\nNumber of pages used=" + numPagesUsed + " Wordlist[" + wordList.size() + "] "
					+ " Start time:" + dateStringIni + " End time:" + dateStringEnd);

			// Once created the cleantext table delete the wikipedia text, page and revision tables.
			wikiToDB.deleteWikipediaTables();
		} finally {
			// release the log file and the connection even if a step above failed
			if (pw != null)
				pw.close();
			wikiToDB.closeDBConnection();
		}
	}

	/**
	 * Prints the current value of every command line parameter to standard output.
	 */
	private void printParameters() {
		// NOTE(review): this prints the MySQL password in clear text - confirm that this is acceptable.
		System.out.println("WikipediaMarkupCleaner parameters:" + "\n -mysqlHost " + getMysqlHost() + "\n -mysqlUser "
				+ getMysqlUser() + "\n -mysqlPasswd " + getMysqlPasswd() + "\n -mysqlDB " + getMysqlDB() + "\n -xmlFile "
				+ getXmlWikiFile() + "\n -minPage " + getMinPageLength() + "\n -minText " + getMinTextLength()
				+ "\n -maxText " + getMaxTextLength() + "\n -log " + getWikiLog() + "\n -debugPageId " + getTestId());

		System.out.println(getDebug() ? " -debug true" : " -debug false");
		System.out.println(getLoadWikiTables() ? " -loadWikiTables true" : " -loadWikiTables false");
		System.out.println(getDeleteCleanTextTable() ? " -deleteCleanTextTable true\n" : " -deleteCleanTextTable false\n");
	}

	/**
	 * Read and parse the command line args.
	 * <p>
	 * Parsing stops with an explanatory message and the usage text when an option is unknown, when an option that needs a
	 * value is the last argument, when a numeric option cannot be parsed, or when a required option is missing.
	 * 
	 * @param args
	 *            the args
	 * @return true, if successful, false otherwise
	 */
	private boolean readArgs(String[] args) {

		String help = "\nUsage: java WikipediaMarkupCleaner -locale language -mysqlHost host -mysqlUser user \n"
				+ " -mysqlPasswd passwd -mysqlDB wikiDB -xmlFile xmlWikiFile \n"
				+ " default/optional: [-minPage 10000 -minText 1000 -maxText 15000] \n"
				+ " optional: [-log wikiLogFile -debugPageId pageId -debug]\n\n"
				+ " -minPage is the minimum size of a wikipedia page that will be considered for cleaning.\n"
				+ " -minText is the minimum size of a text to be kept in the DB.\n"
				+ " -maxText is used to split big articles in small chunks, this is the maximum chunk size. \n"
				+ " -log the wikiLogFile will contain the cleaned text and information about the pages used.\n"
				+ " -debug will produce more output and it is mainly used to debug a particular Wikipedia page.\n"
				+ " -debugPageId is the page_id number in a wikipedia page table (ex. 18702442), when used this option\n"
				+ " the tables will not be loaded, so it is asumed that page, text and revision tables are already loaded.\n"
				+ " -noLoadWikiTables use this variable to save time NOT loading wiki tables, they must already exist in the the DB.\n"
				+ " -noDeleteCleanTextTable use this variable to do NOT create a new cleanText table, but adding to an already existing\n"
				+ " cleanText table.\n";

		// six required options, each followed by a value: minimum 12 arguments
		if (args.length < 12) {
			System.out.println(help);
			return false;
		}

		for (int i = 0; i < args.length; i++) {
			String option = args[i];

			// flags without a value
			if (option.equals("-debug")) {
				setDebug(true);
				continue;
			}
			// Use this variable to save time NOT loading wiki tables, they must already exist in the DB
			if (option.equals("-noLoadWikiTables")) {
				setLoadWikiTables(false);
				continue;
			}
			// Use this variable to do not create a new cleanText table, but adding to an already existing cleanText table.
			if (option.equals("-noDeleteCleanTextTable")) {
				setDeleteCleanTextTable(false);
				continue;
			}

			if (!takesValue(option)) { // unknown argument
				System.out.println("\nOption not known: " + option);
				System.out.println(help);
				return false;
			}
			// the value is the NEXT argument, so it must exist before it is consumed
			if (i + 1 >= args.length) {
				System.out.println("\nMissing value for option: " + option);
				System.out.println(help);
				return false;
			}
			String value = args[++i];
			try {
				applyOption(option, value);
			} catch (NumberFormatException e) {
				System.out.println("\nValue of option " + option + " is not a number: " + value);
				System.out.println(help);
				return false;
			}
		}

		if (getLocale() == null) {
			System.out.println("\nMissing locale.");
			printParameters();
			System.out.println(help);
			return false;
		}

		if (getMysqlHost() == null || getMysqlUser() == null || getMysqlPasswd() == null || getMysqlDB() == null) {
			System.out.println("\nMissing required mysql parameters (one/several required variables are null).");
			printParameters();
			System.out.println(help);
			return false;
		}

		if (getXmlWikiFile() == null) {
			System.out.println("\nMissing required parameter, the XML wikipedia file\n");
			printParameters();
			System.out.println(help);
			return false;
		}

		return true;
	}

	/** Tells whether the given command line option must be followed by a value. */
	private static boolean takesValue(String option) {
		return option.equals("-locale") || option.equals("-mysqlHost") || option.equals("-mysqlUser")
				|| option.equals("-mysqlPasswd") || option.equals("-mysqlDB") || option.equals("-xmlFile")
				|| option.equals("-minPage") || option.equals("-minText") || option.equals("-maxText")
				|| option.equals("-log") || option.equals("-debugPageId");
	}

	/**
	 * Stores the value of one value-taking command line option; the numeric options throw NumberFormatException when the
	 * value is not an integer.
	 */
	private void applyOption(String option, String value) {
		if (option.equals("-locale"))
			setLocale(value);
		else if (option.equals("-mysqlHost"))
			setMysqlHost(value);
		else if (option.equals("-mysqlUser"))
			setMysqlUser(value);
		else if (option.equals("-mysqlPasswd"))
			setMysqlPasswd(value);
		else if (option.equals("-mysqlDB"))
			setMysqlDB(value);
		else if (option.equals("-xmlFile"))
			setXmlWikiFile(value);
		// From here the arguments are optional
		else if (option.equals("-minPage"))
			setMinPageLength(Integer.parseInt(value));
		else if (option.equals("-minText"))
			setMinTextLength(Integer.parseInt(value));
		else if (option.equals("-maxText"))
			setMaxTextLength(Integer.parseInt(value));
		else if (option.equals("-log"))
			setWikiLog(value);
		else if (option.equals("-debugPageId"))
			setTestId(value);
	}

	/**
	 * Command line entry point: parses the arguments, prints them and then either debugs the single page given with
	 * -debugPageId or processes all the wikipedia pages.
	 * 
	 * @param args
	 *            command line arguments, see the usage text in readArgs
	 * @throws Exception
	 *             if the processing fails
	 */
	public static void main(String[] args) throws Exception {

		WikipediaMarkupCleaner wikiCleaner = new WikipediaMarkupCleaner();

		/* check the arguments */
		if (!wikiCleaner.readArgs(args))
			return;
		wikiCleaner.printParameters();

		if (wikiCleaner.getTestId() != null)
			wikiCleaner.processWikipediaSQLTablesDebug();
		else
			wikiCleaner.processWikipediaPages();
	}

}



    © 2015 - 2025 Weber Informatics LLC | Privacy Policy