// org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaPreprocessing Maven / Gradle / Ivy
/*
* Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fbk.cit.hlt.thewikimachine.xmldump;
import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.analysis.Tokenizer;
import org.fbk.cit.hlt.thewikimachine.util.CharacterTable;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.*;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
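/**
* Wikipedia extractor that preprocesses an XML dump: for every page it is
* handed (content, category, template, redirect, file, disambiguation,
* portal, project) it writes one or more text records to the output files
* named by the {@link ExtractorParameters} given to
* {@link #start(ExtractorParameters)}.
*
* User: giuliano
* Date: 1/21/13
* Time: 8:11 AM
*/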
public class WikipediaPreprocessing extends AbstractWikipediaExtractor implements WikipediaExtractor {
/**
* Static logger that references the Logger instance named after the
* WikipediaPreprocessing class.
*/
static Logger logger = Logger.getLogger(WikipediaPreprocessing.class.getName());
// Receives the title of every disambiguation page.
private PrintWriter disambiguationWriter;
// Receives "title<TAB>wikiID" for every page passed to writeTitlePage.
private PrintWriter titleIdWriter;
//private PrintWriter crossLanguageWriter;
// Matches [[xx:...]] links; compiled in start(). Only the commented-out crossLanguage method uses it in this file.
protected Pattern crossLanguagePattern;
// Receives "title<TAB>name<TAB>surname<TAB>birth<TAB>death" records produced by personInfo.
private PrintWriter personInfoWriter;
// The patterns below are compiled in start() from the resource keys PERSONAL_DATA_TEMPLATE_PATTERN,
// BIRTH_DATE_PATTERN, DEATH_DATE_PATTERN, NAME_PATTERN and SURNAME_PATTERN; they stay null when
// the corresponding resource is null (or empty, for the two date patterns).
protected Pattern templatePattern;
protected Pattern birthDatePattern;
protected Pattern deathDatePattern;
protected Pattern namePattern;
protected Pattern surnamePattern;
// Receives "title<TAB>normalizePageName(text)" for every redirect page.
private PrintWriter redirectWriter;
//private PrintWriter textWriter;
// Receives the title of every file page.
private PrintWriter fileWriter;
// Receives "pageTitle<TAB>category" for every category link found in a content page.
private PrintWriter pageCategoryWriter;
// Receives "category<TAB>superCategory" for every category link found in a category page.
private PrintWriter superCategoryWriter;
// Receives "category<TAB>count", where count is the number of category links found in the category page.
private PrintWriter categoryWriter;
// Receives the page counters written by analysis().
private PrintWriter analysisWriter;
// Receives "title<TAB>wikiID" for every content page.
private PrintWriter contentPageTitleWriter;
// Receives the title of every template page.
private PrintWriter templateNameWriter;
// private PrintWriter templateFreqWriter;
// Receives one line per distinct template used in a content page (first occurrence only).
private PrintWriter templateMapWriter;
// Receives one line per template occurrence in a content page (repetitions included).
private PrintWriter templateMapWriterWithRepetitions;
// Receives one line per distinct (template, key) pair used in a content page.
private PrintWriter templateMapWriterProp;
// Receives "pageTitle<TAB>sectionTitle" for every non-skipped section of a content page.
private PrintWriter sectionTitleWriter;
// Receives the group-1 match of simpleTemplatePattern for template pages whose category matches navigationTemplatePattern.
private PrintWriter templateNavigationWriter;
// Section titles matching this (case-insensitive) pattern are not written; null disables the filter.
private Pattern sectionTitleSkipPattern;
// When true, capture group 2 of categoryPattern is used instead of group 1 and the part of a category
// title up to the first colon is dropped (presumably the namespace label — confirm against categoryPattern).
private boolean delCatLabel;
/**
* Creates a preprocessor without an explicit configuration folder; delegates
* to the four-argument constructor passing {@code null} as the folder.
*
* @param numThreads forwarded to the superclass (the command line describes it as the number of threads)
* @param numPages   forwarded to the superclass (the command line describes it as the number of pages to process)
* @param locale     forwarded to the superclass
*/
public WikipediaPreprocessing(int numThreads, int numPages, Locale locale) {
this(numThreads, numPages, locale, null);
}
/**
* Creates a preprocessor; all arguments are forwarded unchanged to the
* superclass constructor.
*
* @param numThreads          forwarded to the superclass
* @param numPages            forwarded to the superclass
* @param locale              forwarded to the superclass
* @param configurationFolder forwarded to the superclass; may be {@code null} (see the three-argument constructor)
*/
public WikipediaPreprocessing(int numThreads, int numPages, Locale locale, String configurationFolder) {
super(numThreads, numPages, locale, configurationFolder);
// Category labels are always stripped; no setter for this flag is visible in this class.
this.delCatLabel = true;
}
/**
 * Compiles the resource-defined patterns, opens every output file and then
 * starts processing the Wikipedia XML dump.
 * <p>
 * If any output file cannot be opened, the failure is logged together with
 * its stack trace, the writers opened so far are closed and the dump is NOT
 * processed: continuing with unopened (null) writers would only fail later
 * with a NullPointerException on the first page written to one of them.
 *
 * @param extractorParameters supplies the dump file name and the names of all output files
 */
@Override
public void start(ExtractorParameters extractorParameters) {
    // Each pattern is compiled only when its resource is present; otherwise the field keeps its current value.
    String personalDataTemplate = resources.getString("PERSONAL_DATA_TEMPLATE_PATTERN");
    if (personalDataTemplate != null) {
        templatePattern = Pattern.compile(personalDataTemplate);
    }
    String sectionTitleSkip = resources.getString("SECTION_TITLE_SKIP_PATTERN");
    if (sectionTitleSkip != null) {
        sectionTitleSkipPattern = Pattern.compile(sectionTitleSkip, Pattern.CASE_INSENSITIVE);
    }
    String nameRegex = resources.getString("NAME_PATTERN");
    if (nameRegex != null) {
        namePattern = Pattern.compile(nameRegex);
    }
    String surnameRegex = resources.getString("SURNAME_PATTERN");
    if (surnameRegex != null) {
        surnamePattern = Pattern.compile(surnameRegex);
    }
    // The two date patterns are optional: an empty resource disables them as well.
    String birthDateRegex = resources.getString("BIRTH_DATE_PATTERN");
    if (birthDateRegex != null && birthDateRegex.length() != 0) {
        birthDatePattern = Pattern.compile(birthDateRegex);
    }
    String deathDateRegex = resources.getString("DEATH_DATE_PATTERN");
    if (deathDateRegex != null && deathDateRegex.length() != 0) {
        deathDatePattern = Pattern.compile(deathDateRegex);
    }
    crossLanguagePattern = Pattern.compile("\\[\\[(\\w\\w:[^\\]]+)\\]\\]");

    logger.info("templatePattern: " + templatePattern);
    logger.info("namePattern: " + namePattern);
    logger.info("surnamePattern: " + surnamePattern);
    logger.info("birthDatePattern: " + birthDatePattern);
    logger.info("deathDatePattern: " + deathDatePattern);
    logger.info("crossLanguagePattern: " + crossLanguagePattern);

    // Tracks every writer opened so far so that a partial failure does not leak file handles.
    List<PrintWriter> opened = new ArrayList<PrintWriter>();
    try {
        analysisWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaAnalysisFileName()), opened);
        disambiguationWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaDisambiguationFileName()), opened);
        titleIdWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTitleIdFileName()), opened);
        contentPageTitleWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaContentPageFileName()), opened);
        //crossLanguageWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaCrossLanguageLinkFileName()), opened);
        personInfoWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaPersonInfoFileName()), opened);
        redirectWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaRedirFileName()), opened);
        //textWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTextFileName()), opened);
        pageCategoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaPageCategoryFileName()), opened);
        superCategoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaCategorySuperCategoryFileName()), opened);
        categoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaCategoryFileName()), opened);
        sectionTitleWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaSectionTitleFilePrefixName()), opened);
        fileWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaFileName()), opened);
        templateNameWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("name")), opened);
        // templateFreqWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("freq")), opened);
        templateMapWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map")), opened);
        templateMapWriterWithRepetitions = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map-rep")), opened);
        templateMapWriterProp = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map-prop")), opened);
        templateNavigationWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("navigation")), opened);
    } catch (IOException e) {
        // Pass the exception as the second argument so log4j records the stack trace.
        logger.error("Unable to open the output files, extraction aborted", e);
        for (PrintWriter writer : opened) {
            writer.close();
        }
        return;
    }
    startProcess(extractorParameters.getWikipediaXmlFileName());
}

/**
 * Wraps the given stream in a buffered UTF-8 PrintWriter and records it in {@code opened}.
 */
private static PrintWriter utf8Writer(OutputStream out, List<PrintWriter> opened) throws IOException {
    PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8")));
    opened.add(writer);
    return writer;
}
/**
* Callback for a file page: records the title/id pair and appends the title
* to the file-page list.
*
* @param text   page text (not used by this method)
* @param title  page title
* @param wikiID page identifier
*/
@Override
public void filePage(String text, String title, int wikiID) {
writeTitlePage(title, wikiID);
// Writes are serialized on this instance (presumably because pages are handled by several threads — confirm in the superclass).
synchronized (this) {
//todo: don't save the prefix File: (in this case change WikipediaFileSourceExtractor.read)
fileWriter.println(title);
}
}
/**
 * Callback for a category page. Records the title/id pair, then writes one
 * "category TAB superCategory" line for every category link found in the
 * page text and one "category TAB count" line with the number of links found.
 *
 * @param text   page text scanned with categoryPattern
 * @param title  page title; when delCatLabel is set, everything up to and
 *               including the first colon is dropped
 * @param wikiID page identifier
 */
@Override
public void categoryPage(String text, String title, int wikiID) {
    writeTitlePage(title, wikiID);

    final Matcher linkMatcher = categoryPattern.matcher(text);
    final StringBuilder lines = new StringBuilder();
    final int group = delCatLabel ? 2 : 1;
    // indexOf yields -1 when there is no colon, so the whole title is kept in that case.
    final String category = delCatLabel
            ? title.substring(title.indexOf(CharacterTable.COLON) + 1)
            : title;

    int superCategoryCount = 0;
    while (linkMatcher.find()) {
        String superCategory = text
                .substring(linkMatcher.start(group), linkMatcher.end(group))
                .replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);

        lines.append(category);
        lines.append(CharacterTable.HORIZONTAL_TABULATION);

        // Keep only the part before the first vertical line, if there is one.
        int bar = superCategory.indexOf(CharacterTable.VERTICAL_LINE);
        String target = (bar == -1) ? superCategory : superCategory.substring(0, bar);
        lines.append(normalizePageName(target));
        lines.append(CharacterTable.LINE_FEED);

        superCategoryCount++;
    }

    synchronized (this) {
        superCategoryWriter.print(lines);
        //todo: debug
        categoryWriter.println(category + "\t" + superCategoryCount);
    }
}
/**
 * Callback for a template page. Records the title/id pair and the template
 * title; then, for every category link in the text whose name matches
 * navigationTemplatePattern, writes group 1 of simpleTemplatePattern applied
 * to the title (when it matches) to the navigation-template file.
 *
 * @param text   page text scanned with categoryPattern
 * @param title  template page title
 * @param wikiID page identifier
 */
@Override
public void templatePage(String text, String title, int wikiID) {
    writeTitlePage(title, wikiID);
    synchronized (this) {
        templateNameWriter.println(title);
    }

    // Extract categories
    //todo: make uppercase the first letter
    final int nameGroup = 2;
    final Matcher categoryMatcher = categoryPattern.matcher(text);
    while (categoryMatcher.find()) {
        String category = text
                .substring(categoryMatcher.start(nameGroup), categoryMatcher.end(nameGroup))
                .replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);

        if (navigationTemplatePattern == null) {
            continue;
        }
        if (!navigationTemplatePattern.matcher(category).find()) {
            continue;
        }
        if (simpleTemplatePattern == null) {
            continue;
        }
        Matcher simpleMatcher = simpleTemplatePattern.matcher(title);
        if (!simpleMatcher.find()) {
            continue;
        }

        String simpleTemplate = simpleMatcher.group(1);
        synchronized (this) {
            templateNavigationWriter.append(simpleTemplate);
            templateNavigationWriter.append(CharacterTable.LINE_FEED);
        }
    }
}
/**
 * Callback for a redirect page. Records the title/id pair and writes
 * "title TAB normalizePageName(text)" to the redirect file.
 *
 * @param text   value passed through normalizePageName and written as the second column
 * @param title  title of the redirect page
 * @param wikiID page identifier
 */
@Override
public void redirectPage(String text, String title, int wikiID) {
    writeTitlePage(title, wikiID);

    final String record = title + CharacterTable.HORIZONTAL_TABULATION + normalizePageName(text);
    synchronized (this) {
        redirectWriter.println(record);
    }
}
/**
* Callback for a content page. Runs every per-page extraction step of this
* class in a fixed order: content-title record, title/id record, person
* info, section titles, page categories and template usage.
*
* @param text   page text
* @param title  page title
* @param wikiID page identifier
*/
@Override
public void contentPage(String text, String title, int wikiID) {
//crossLanguage(text, title);
writeTitleContentPage(title, wikiID);
writeTitlePage(title, wikiID);
personInfo(text, title);
textAndSections(text, title, wikiID);
pageCategory(text, title);
pageTemplate(text, title, wikiID);
}
/**
 * Writes the template usage of a content page to the three template-map files.
 * <ul>
 * <li>map: "title TAB template TAB order TAB wikiID", once per distinct
 * template, where order counts the distinct templates seen so far;</li>
 * <li>map with repetitions: "title TAB template TAB partsCount TAB nlCount
 * TAB keyValueParts", once per template occurrence;</li>
 * <li>map of properties: "title TAB template TAB key", once per distinct
 * (template, key) pair.</li>
 * </ul>
 * Templates with a null or empty name, or whose name starts with '#', are skipped.
 *
 * @param text   page text handed to WikiTemplateParser
 * @param title  page title
 * @param wikiID page identifier
 */
void pageTemplate(String text, String title, int wikiID) {
    // Explicit type arguments are required: iterating a raw list as WikiTemplate does not compile.
    List<WikiTemplate> listOfTemplates = WikiTemplateParser.parse(text, false);
    Set<String> seenTemplates = new HashSet<String>();
    Set<String> seenKeys = new HashSet<String>();
    // The buffers are method-local, so the unsynchronized StringBuilder is sufficient.
    StringBuilder toBeWrittenMap = new StringBuilder();
    StringBuilder toBeWrittenMapRep = new StringBuilder();
    StringBuilder toBeWrittenMapProp = new StringBuilder();
    int i = 0;
    for (WikiTemplate t : listOfTemplates) {
        Map<?, ?> parts = t.getHashMapOfParts();
        String name = t.getFirstPart();
        if (name == null || name.length() == 0) {
            continue;
        }
        if (name.startsWith("#")) {
            continue;
        }
        name = normalizePageName(name.trim()).replace(' ', '_');

        String toBeWritten;
        // Set.add returns true only the first time a name is seen on this page.
        if (seenTemplates.add(name)) {
            toBeWritten = title + "\t" + name + "\t" + i + "\t" + wikiID;
            toBeWrittenMap.append(toBeWritten).append("\n");
            i++;
        }

        toBeWritten = title + "\t" + name + "\t" + t.getPartsCount() + "\t" + t.getNlCount() + "\t" + t.getKeyValueParts();
        toBeWrittenMapRep.append(toBeWritten).append("\n");

        for (Object key : parts.keySet()) {
            String keyName = (String) key;
            if (seenKeys.add(name + ";" + keyName)) {
                toBeWritten = title + "\t" + name + "\t" + keyName;
                toBeWrittenMapProp.append(toBeWritten).append("\n");
            }
        }
    }
    synchronized (this) {
        templateMapWriter.print(toBeWrittenMap);
        templateMapWriterWithRepetitions.print(toBeWrittenMapRep);
        templateMapWriterProp.print(toBeWrittenMapProp);
    }
}
/**
 * Writes one "title TAB category" line to the page-category file for every
 * category link that categoryPattern finds in the page text.
 *
 * @param text  page text scanned with categoryPattern
 * @param title page title, written as the first column
 */
void pageCategory(String text, String title) {
    final Matcher linkMatcher = categoryPattern.matcher(text);
    final StringBuilder lines = new StringBuilder();
    final int group = delCatLabel ? 2 : 1;

    while (linkMatcher.find()) {
        String category = text
                .substring(linkMatcher.start(group), linkMatcher.end(group))
                .replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);

        lines.append(title);
        lines.append(CharacterTable.HORIZONTAL_TABULATION);

        // Keep only the part before the first vertical line, if there is one.
        int bar = category.indexOf(CharacterTable.VERTICAL_LINE);
        String target = (bar == -1) ? category : category.substring(0, bar);
        lines.append(normalizePageName(target));
        lines.append(CharacterTable.LINE_FEED);
    }

    synchronized (this) {
        pageCategoryWriter.print(lines);
    }
}
/**
 * Parses the page markup and writes one "title TAB sectionTitle" line to the
 * section-title file for every section that has a non-blank title not
 * matched by sectionTitleSkipPattern.
 * <p>
 * Any exception raised while parsing or writing is logged, together with its
 * stack trace, and the page is skipped; it is never propagated to the caller.
 *
 * @param text   page wiki markup
 * @param title  page title, written as the first column
 * @param wikiID page identifier, used only in the error message
 */
void textAndSections(String text, String title, int wikiID) {
    try {
        WikiMarkupParser wikiMarkupParser = WikiMarkupParser.getInstance();
        String[] prefixes = {filePrefix, imagePrefix};
        ParsedPage parsedPage = wikiMarkupParser.parsePage(text, prefixes);

        // Text (MOVED): the tokenized page text is no longer written by this method.

        // Sections
        StringBuilder sb = new StringBuilder();
        for (Section section : parsedPage.getSections()) {
            String sectionTitle = section.getTitle();
            if (sectionTitle == null || sectionTitle.trim().length() == 0) {
                continue;
            }
            if (sectionTitleSkipPattern != null && sectionTitleSkipPattern.matcher(sectionTitle).find()) {
                continue;
            }
            sb.append(title);
            sb.append(CharacterTable.HORIZONTAL_TABULATION);
            sb.append(sectionTitle);
            sb.append(CharacterTable.LINE_FEED);
        }
        synchronized (this) {
            sectionTitleWriter.print(sb.toString());
        }
    } catch (Exception e) {
        // Pass the exception to the logger so the cause is not lost.
        logger.error("Error processing page " + title + " (" + wikiID + ")", e);
    }
}
/**
* Returns the whole content of the page tokenized in a single line.
* The first token is the page title (with underscores)
*/
/*private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
StringBuilder sb = new StringBuilder();
sb.append(title);
sb.append(CharacterTable.SPACE);
Tokenizer tokenizer = HardTokenizer.getInstance();
String tokenizedTitle = tokenizer.tokenizedString(title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE));
sb.append(tokenizedTitle);
String rawContent;
String tokenizedContent;
List list;
for (Section section : parsedPage.getSections()) {
list = section.getContentList();
for (int i = 0; i < list.size(); i++) {
rawContent = list.get(i).getText();
if (rawContent.length() > 0) {
tokenizedContent = tokenizer.tokenizedString(rawContent);
if (tokenizedContent.length() > 0) {
sb.append(CharacterTable.SPACE);
sb.append(tokenizedContent);
}
}
}
}
return sb.toString();
} */
/**
 * Extracts personal data from the page and, when both a name and a surname
 * are found, writes "title TAB name TAB surname TAB birth TAB death" to the
 * person-info file. Birth and death are left empty when absent.
 * <p>
 * The personal-data template is located with templatePattern (group 1); name,
 * surname, birth and death are each group 1 of the corresponding pattern
 * applied to the trimmed template text. A pattern that was not configured
 * (null) is treated as "no match" instead of raising a NullPointerException,
 * so all five patterns behave the way the birth/death patterns already did.
 *
 * @param text  page text
 * @param title page title, written as the first column
 */
void personInfo(String text, String title) {
    //todo: check why here!!!
    if (templatePattern == null) {
        // No personal-data template configured: nothing can be extracted.
        return;
    }
    Matcher templateMatcher = templatePattern.matcher(text);
    if (!templateMatcher.find()) {
        return;
    }
    String templateText = templateMatcher.group(1);
    if (templateText == null) {
        return;
    }
    templateText = templateText.trim();

    String name = firstGroupOrNull(namePattern, templateText);
    String surname = firstGroupOrNull(surnamePattern, templateText);
    if (name == null || surname == null) {
        // A record is written only when both name and surname are available.
        return;
    }
    String birthYear = firstGroupOrNull(birthDatePattern, templateText);
    String deathYear = firstGroupOrNull(deathDatePattern, templateText);

    StringBuilder sb = new StringBuilder();
    sb.append(title);
    sb.append(CharacterTable.HORIZONTAL_TABULATION);
    sb.append(name);
    sb.append(CharacterTable.HORIZONTAL_TABULATION);
    sb.append(surname);
    sb.append(CharacterTable.HORIZONTAL_TABULATION);
    if (birthYear != null) {
        sb.append(birthYear);
    }
    // The separator is always written so that every record has the same number of columns.
    sb.append(CharacterTable.HORIZONTAL_TABULATION);
    if (deathYear != null) {
        sb.append(deathYear);
    }
    synchronized (this) {
        personInfoWriter.println(sb.toString());
    }
}

/**
 * Returns the trimmed group 1 of the first match of {@code pattern} in {@code input},
 * or null when the pattern is null, does not match, or group 1 did not participate.
 */
private static String firstGroupOrNull(Pattern pattern, String input) {
    if (pattern == null) {
        return null;
    }
    Matcher matcher = pattern.matcher(input);
    if (!matcher.find()) {
        return null;
    }
    String value = matcher.group(1);
    return value == null ? null : value.trim();
}
/**
 * Appends a "title TAB wikiID" line to the title/id file.
 *
 * @param title  page title
 * @param wikiID page identifier
 */
private void writeTitlePage(String title, int wikiID) {
    final String record = title + CharacterTable.HORIZONTAL_TABULATION + wikiID;
    synchronized (this) {
        titleIdWriter.println(record);
    }
}
/**
 * Appends a "title TAB wikiID" line to the content-page title file.
 *
 * @param title  content page title
 * @param wikiID page identifier
 */
void writeTitleContentPage(String title, int wikiID) {
    final String record = title + CharacterTable.HORIZONTAL_TABULATION + wikiID;
    synchronized (this) {
        contentPageTitleWriter.println(record);
    }
}
/*
void crossLanguage(String text, String title) {
StringBuilder buffer = new StringBuilder();
Matcher m = crossLanguagePattern.matcher(text);
buffer.append(title);
while (m.find()) {
int s = m.start(1);
int e = m.end(1);
String foreignPage = text.substring(s, e).replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);
buffer.append(StringTable.HORIZONTAL_TABULATION);
buffer.append(foreignPage);
}
synchronized (this) {
crossLanguageWriter.println(buffer.toString());
}
}
*/
/**
* Callback for a portal page: only the title/id pair is recorded.
*
* @param text   page text (not used by this method)
* @param title  page title
* @param wikiID page identifier
*/
@Override
public void portalPage(String text, String title, int wikiID) {
writeTitlePage(title, wikiID);
}
/**
* Callback for a project page: only the title/id pair is recorded.
*
* @param text   page text (not used by this method)
* @param title  page title
* @param wikiID page identifier
*/
@Override
public void projectPage(String text, String title, int wikiID) {
writeTitlePage(title, wikiID);
}
/**
* Callback for a disambiguation page: records the title/id pair and appends
* the title to the disambiguation file.
*
* @param text   page text (not used by this method)
* @param title  page title
* @param wikiID page identifier
*/
@Override
public void disambiguationPage(String text, String title, int wikiID) {
writeTitlePage(title, wikiID);
// The write is serialized on this instance, like every other writer access in this class.
synchronized (this) {
disambiguationWriter.println(title);
}
}
/**
 * Writes the run summary to the analysis file as "key=value" lines: the
 * current date followed by the page counters (which are not declared in
 * this class).
 */
void analysis() {
    synchronized (this) {
        final String[] report = {
                "date=" + new Date(),
                "total=" + generalCount,
                "content=" + countPageCounter,
                "disambiguation=" + disambiguationPageCounter,
                "category=" + categoryPageCounter,
                "redirect=" + redirectPageCounter,
                "template=" + templatePageCounter,
                "mediawiki=" + mediawikiPageCounter,
                "wikipedia=" + wikipediaPageCounter,
                "file=" + filePageCounter,
                "special=" + specialPageCounter,
                "image=" + imagePageCounter,
                "project=" + projectPageCounter,
                "other=" + otherPageCounter
        };
        for (String entry : report) {
            analysisWriter.println(entry);
        }
    }
}
/**
 * Finishes the extraction: runs the superclass hook, writes the analysis
 * summary and closes every output writer opened by start().
 * <p>
 * All writers are buffered, so a writer that is not closed loses whatever is
 * still in its buffer. The file, category and section-title writers were
 * previously left open, which truncated their output; they are now closed
 * with the others. Writers that were never opened (null) are skipped.
 */
@Override
public void endProcess() {
    super.endProcess();
    analysis();
    closeIfOpen(analysisWriter);
    closeIfOpen(titleIdWriter);
    //crossLanguageWriter.close();
    closeIfOpen(disambiguationWriter);
    closeIfOpen(personInfoWriter);
    closeIfOpen(redirectWriter);
    //textWriter.close();
    closeIfOpen(fileWriter);
    closeIfOpen(pageCategoryWriter);
    closeIfOpen(superCategoryWriter);
    closeIfOpen(categoryWriter);
    closeIfOpen(sectionTitleWriter);
    closeIfOpen(contentPageTitleWriter);
    // templateFreqWriter.println(count + " pages with at least one template");
    closeIfOpen(templateNameWriter);
    // templateFreqWriter.close();
    closeIfOpen(templateMapWriter);
    closeIfOpen(templateMapWriterWithRepetitions);
    closeIfOpen(templateMapWriterProp);
    closeIfOpen(templateNavigationWriter);
}

/**
 * Closes (and thereby flushes) the writer unless it is null.
 */
private static void closeIfOpen(PrintWriter writer) {
    if (writer != null) {
        writer.close();
    }
}
/**
 * Command-line entry point. Configures log4j (from the file named by the
 * "log-config" system property, or configuration/log-config.txt), parses the
 * options and runs the preprocessing on the given dump.
 * <p>
 * Required options: -d/--wikipedia-dump and -o/--output-dir. Optional integer
 * options: -t/--num-threads, -p/--num-pages, -n/--notification-point. A
 * missing required option or a non-integer value for an integer option is
 * reported through the logger and followed by the usage text. The -h and -v
 * options are registered but not inspected by this method.
 *
 * @param args command-line arguments
 * @throws IOException declared for compatibility with existing callers
 */
public static void main(String args[]) throws IOException {
    String logConfig = System.getProperty("log-config");
    if (logConfig == null) {
        logConfig = "configuration/log-config.txt";
    }
    PropertyConfigurator.configure(logConfig);

    Options options = new Options();
    try {
        Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg().withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump").create("d");
        Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("output directory in which to store output files").isRequired().withLongOpt("output-dir").create("o");
        Option numThreadOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of threads (default " + AbstractWikipediaXmlDumpParser.DEFAULT_THREADS_NUMBER + ")").withLongOpt("num-threads").create("t");
        Option numPageOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of pages to process (default all)").withLongOpt("num-pages").create("p");
        Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default " + AbstractWikipediaExtractor.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("n");

        options.addOption("h", "help", false, "print this message");
        options.addOption("v", "version", false, "output version information and exit");
        options.addOption(wikipediaDumpOpt);
        options.addOption(outputDirOpt);
        options.addOption(numThreadOpt);
        options.addOption(numPageOpt);
        options.addOption(notificationPointOpt);

        CommandLineParser parser = new PosixParser();
        CommandLine line = parser.parse(options, args);
        logger.debug(line);

        int numThreads = intOption(line, "num-threads", AbstractWikipediaXmlDumpParser.DEFAULT_THREADS_NUMBER);
        int numPages = intOption(line, "num-pages", AbstractWikipediaExtractor.DEFAULT_NUM_PAGES);
        int notificationPoint = intOption(line, "notification-point", AbstractWikipediaExtractor.DEFAULT_NOTIFICATION_POINT);

        ExtractorParameters extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"));
        logger.debug(extractorParameters);

        WikipediaExtractor wikipediaExtractor = new WikipediaPreprocessing(numThreads, numPages, extractorParameters.getLocale());
        wikipediaExtractor.setNotificationPoint(notificationPoint);
        wikipediaExtractor.start(extractorParameters);
    } catch (ParseException e) {
        // oops, something went wrong
        logger.error("Parsing failed: " + e.getMessage() + "\n");
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(200, "java -cp properties:dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaPreprocessing", "\n", options, "\n", true);
    } finally {
        logger.info("extraction ended " + new Date());
    }
}

/**
 * Returns the integer value of the named option, or {@code defaultValue} when the option is absent.
 * A non-integer value is reported as a ParseException so that it takes the same
 * usage/help path as any other malformed command line.
 */
private static int intOption(CommandLine line, String name, int defaultValue) throws ParseException {
    if (!line.hasOption(name)) {
        return defaultValue;
    }
    String value = line.getOptionValue(name);
    try {
        return Integer.parseInt(value);
    } catch (NumberFormatException e) {
        // Commons CLI's ParseException only accepts a message, so the offending value is spelled out in it.
        throw new ParseException("option --" + name + " expects an integer, got '" + value + "'");
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy