/**
* Copyright 2000-2009 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.tools.dbselection;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
public class WikipediaDumpSplitter {
private int maxPages = 25000;
private String xmlWikipediaDumpFile = null;
private String dirOuputFiles = null;
public void setXmlWikipediaDumpFile(String str) {
xmlWikipediaDumpFile = str;
}
public void setDirOuputFiles(String str) {
dirOuputFiles = str;
}
public void setMaxPages(int val) {
maxPages = val;
}
public String getXmlWikipediaDumpFile() {
return xmlWikipediaDumpFile;
}
public String getDirOuputFiles() {
return dirOuputFiles;
}
public int getMaxPages() {
return maxPages;
}
/**
* This function splits a big XML wikipedia dump file (e.g. 19 GB for enwiki) into small XML chunks according to the
* specified maximum number of pages per chunk.
*
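* <p>
* A minimal sketch of how {@link #main(String[])} drives this method; the dump and output locations below are
* illustrative placeholders, not defaults of this class:
*
* <pre>{@code
* WikipediaDumpSplitter splitter = new WikipediaDumpSplitter();
* splitter.setXmlWikipediaDumpFile("/data/wiki/enwiki-latest-pages-articles.xml");
* splitter.setDirOuputFiles("/data/wiki/chunks/");
* splitter.setMaxPages(25000);
* splitter.splitWikipediaDump(splitter.getXmlWikipediaDumpFile(), splitter.getDirOuputFiles(),
*         splitter.getMaxPages());
* }</pre>
*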
* @param xmlFile
* name of the XML wikipedia file.
* @param dirFiles
* directory where to save the small xml chunks.
* @param maxPagesPerChunk
* maximum number of pages per chunk; for example, 250000 pages (~30 MB).
*/
private void splitWikipediaDump(String xmlFile, String dirFiles, int maxPagesPerChunk) {
int totalPageNumber = 0;
int currentPageNumber = 0;
int numFiles = 0;
String outFileName = "";
String nextLine;
boolean checkSiteInfo = true;
boolean siteInfo = false;
StringBuilder strInfo = new StringBuilder();
FileWriter outputStream = null;
// progress is reported every num pages while scanning the dump
int num = (int) Math.round(maxPagesPerChunk * 0.50);
// we need to scan line by line a big (for ex. 19GB for enwiki) xml file
BufferedReader inputStream = null;
try {
inputStream = new BufferedReader(new FileReader(xmlFile));
while ((nextLine = inputStream.readLine()) != null) {
// get first the siteinfo
if (checkSiteInfo) {
if (nextLine.startsWith(" from header, it will be added to all the xml files.\n");
// System.out.println("siteInfo:" + strInfo);
} else if (nextLine.startsWith(" ")) {
if (currentPageNumber == maxPagesPerChunk) {
outputStream.write("\n");
currentPageNumber = 0;
outputStream.close();
outputStream = null;
}
if (outputStream == null) {
numFiles++;
outFileName = dirFiles + "page" + Integer.toString(numFiles) + ".xml";
System.out.println("outFileName(" + maxPagesPerChunk + "):" + outFileName);
outputStream = new FileWriter(outFileName);
outputStream.write("\n");
// we need the siteinfo at the begining of each chunk
outputStream.write(strInfo.toString());
outputStream.write(nextLine + "\n");
} else
outputStream.write(nextLine + "\n");
if (nextLine.startsWith(" 0) {
System.out.println("number of wikipages = " + totalPageNumber + " last chunk with " + currentPageNumber
+ " pages.");
outputStream.write(" \n");
outputStream.close();
}
} catch (Exception e) {
System.err.println("Exception: " + e.getMessage());
} finally {
try {
if (inputStream != null) {
inputStream.close();
}
} catch (Exception e) {
System.err.println("Exception: " + e.getMessage());
}
}
}
/**
* Read and parse the command line args
*
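* <p>
* For illustration, an args array in the expected form; the file locations here are placeholders:
*
* <pre>{@code
* String[] args = { "-xmlDump", "/data/wiki/enwiki-latest-pages-articles.xml",
*         "-outDir", "/data/wiki/chunks/", "-maxPages", "25000" };
* }</pre>
*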
* @param args
* the command line arguments
* @return true, if successful, false otherwise
*/
private boolean readArgs(String[] args) {
String help = "\nUsage: java WikipediaDumpSplitter -xmlDump xmlDumpFile -dirOut outputFilesDir -maxPages maxNumberPages \n"
+ " -xmlDump xml wikipedia dump file. \n"
+ " -outDir directory where the small xml chunks will be saved.\n"
+ " -maxPages maximum number of pages of each small xml chunk (if no specified default 25000). \n\n";
if (args.length >= 4) { // at least the two required options (-xmlDump, -outDir) and their values
for (int i = 0; i < args.length; i++) {
if (args[i].contentEquals("-xmlDump") && args.length >= (i + 1))
setXmlWikipediaDumpFile(args[++i]);
else if (args[i].contentEquals("-outDir") && args.length >= (i + 1))
setDirOuputFiles(args[++i]);
// this argument is optional
else if (args[i].contentEquals("-maxPages") && args.length >= (i + 1))
setMaxPages(Integer.parseInt(args[++i]));
}
} else {
System.out.println(help);
return false;
}
if (getXmlWikipediaDumpFile() == null || getDirOuputFiles() == null) {
System.out.println("\nMissing required parameter -xmlDump or -dirOut.");
System.out.println(help);
return false;
}
if (getMaxPages() == 0) {
System.out.println("Number of pages per xml file not specified. Using defaul value maxPages = 25000");
setMaxPages(25000);
}
return true;
}
private void printParameters() {
System.out.println("\nWikipediaDumpSplitter parameters:" + "\n -xmlDump " + getXmlWikipediaDumpFile()
+ "\n -outDir " + getDirOuputFiles() + "\n -maxPages " + getMaxPages() + "\n");
}
public static void main(String[] args) throws Exception {
WikipediaDumpSplitter wiki = new WikipediaDumpSplitter();
/* check the arguments */
if (!wiki.readArgs(args))
return;
wiki.printParameters();
wiki.splitWikipediaDump(wiki.getXmlWikipediaDumpFile(), wiki.getDirOuputFiles(), wiki.getMaxPages());
}
}