All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaPreprocessing Maven / Gradle / Ivy

/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fbk.cit.hlt.thewikimachine.xmldump;

import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.analysis.Tokenizer;
import org.fbk.cit.hlt.thewikimachine.util.CharacterTable;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.*;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

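/**
 * Preprocesses a Wikipedia XML dump: for each kind of page delivered to the extractor
 * callbacks (content, category, template, redirect, file, portal, project and
 * disambiguation pages) it writes tab-separated records to a set of UTF-8 output files
 * whose names are supplied by {@link ExtractorParameters}.
 *
 * User: giuliano
 * Date: 1/21/13
 * Time: 8:11 AM
 */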
public class WikipediaPreprocessing extends AbstractWikipediaExtractor implements WikipediaExtractor {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named WikipediaPreprocessing.
	 */
	static Logger logger = Logger.getLogger(WikipediaPreprocessing.class.getName());

	// Receives the title of every disambiguation page (see disambiguationPage).
	private PrintWriter disambiguationWriter;

	// Receives "title<TAB>wikiID" for every page passed to writeTitlePage.
	private PrintWriter titleIdWriter;

	//private PrintWriter crossLanguageWriter;

	// Compiled in start(); only referenced by the commented-out crossLanguage method.
	protected Pattern crossLanguagePattern;

	// Receives "title, name, surname, birth year, death year" records (see personInfo).
	private PrintWriter personInfoWriter;

	// Locates the personal-data template in a page text; group 1 is the template body (see personInfo).
	protected Pattern templatePattern;

	// Applied to the template body; group 1 is written as the birth year. May be null.
	protected Pattern birthDatePattern;

	// Applied to the template body; group 1 is written as the death year. May be null.
	protected Pattern deathDatePattern;

	// Applied to the template body; group 1 is written as the name.
	protected Pattern namePattern;

	// Applied to the template body; group 1 is written as the surname.
	protected Pattern surnamePattern;

	// Receives "title<TAB>normalized redirect text" for redirect pages.
	private PrintWriter redirectWriter;

	//private PrintWriter textWriter;

	// Receives the title of every file page.
	private PrintWriter fileWriter;

	// Receives "page title<TAB>category" for each category found in a content page.
	private PrintWriter pageCategoryWriter;

	// Receives "category<TAB>super category" for each category found in a category page.
	private PrintWriter superCategoryWriter;

	// Receives "category<TAB>number of super categories found" for each category page.
	private PrintWriter categoryWriter;

	// Receives the page counters written by analysis().
	private PrintWriter analysisWriter;

	// Receives "title<TAB>wikiID" for content pages only.
	private PrintWriter contentPageTitleWriter;

	// Receives the title of every template page.
	private PrintWriter templateNameWriter;

	// private PrintWriter templateFreqWriter;

	// Receives one record per distinct template used in a content page (see pageTemplate).
	private PrintWriter templateMapWriter;

	// Receives one record per template occurrence in a content page, repetitions included.
	private PrintWriter templateMapWriterWithRepetitions;

	// Receives one record per distinct (template, key) pair used in a content page.
	private PrintWriter templateMapWriterProp;

	// Receives "page title<TAB>section title" for the sections that are not skipped.
	private PrintWriter sectionTitleWriter;

	// Receives the simple name of templates having a category matched by navigationTemplatePattern.
	private PrintWriter templateNavigationWriter;

	// Case-insensitive pattern; section titles in which it finds a match are not written.
	private Pattern sectionTitleSkipPattern;

	// Set to true by the constructor. When true, capture group 2 of categoryPattern is used
	// instead of group 1, and categoryPage drops the title prefix up to the first COLON.
	private boolean delCatLabel;

	/**
	 * Creates a preprocessor without a configuration folder; delegates to the
	 * four-argument constructor passing {@code null} as the folder.
	 *
	 * @param numThreads forwarded to the superclass constructor
	 * @param numPages   forwarded to the superclass constructor
	 * @param locale     forwarded to the superclass constructor
	 */
	public WikipediaPreprocessing(int numThreads, int numPages, Locale locale) {
		this(numThreads, numPages, locale, null);
	}

	/**
	 * Creates a preprocessor, forwarding all arguments to the superclass constructor and
	 * enabling {@code delCatLabel}.
	 *
	 * @param numThreads          forwarded to the superclass constructor
	 * @param numPages            forwarded to the superclass constructor
	 * @param locale              forwarded to the superclass constructor
	 * @param configurationFolder forwarded to the superclass constructor; may be {@code null}
	 */
	public WikipediaPreprocessing(int numThreads, int numPages, Locale locale, String configurationFolder) {
		super(numThreads, numPages, locale, configurationFolder);
		this.delCatLabel = true;
	}

	/**
	 * Compiles the configured regular expressions, opens one UTF-8 writer per output file
	 * named by {@code extractorParameters}, and then starts processing the XML dump.
	 * <p>
	 * If an output file cannot be opened, the failure is logged together with its stack
	 * trace, the writers that were already opened are closed, and the dump is not
	 * processed: running the page callbacks with unopened (null) writers would only fail
	 * later with a {@link NullPointerException}.
	 *
	 * @param extractorParameters supplies the output file names and the dump file name
	 */
	@Override
	public void start(ExtractorParameters extractorParameters) {
		String templateRegex = resources.getString("PERSONAL_DATA_TEMPLATE_PATTERN");
		if (templateRegex != null) {
			templatePattern = Pattern.compile(templateRegex);
		}

		String sectionSkipRegex = resources.getString("SECTION_TITLE_SKIP_PATTERN");
		if (sectionSkipRegex != null) {
			sectionTitleSkipPattern = Pattern.compile(sectionSkipRegex, Pattern.CASE_INSENSITIVE);
		}

		String nameRegex = resources.getString("NAME_PATTERN");
		if (nameRegex != null) {
			namePattern = Pattern.compile(nameRegex);
		}

		String surnameRegex = resources.getString("SURNAME_PATTERN");
		if (surnameRegex != null) {
			surnamePattern = Pattern.compile(surnameRegex);
		}

		// The date patterns are optional: an empty value leaves the pattern unset.
		String birthRegex = resources.getString("BIRTH_DATE_PATTERN");
		if (birthRegex != null && birthRegex.length() != 0) {
			birthDatePattern = Pattern.compile(birthRegex);
		}

		String deathRegex = resources.getString("DEATH_DATE_PATTERN");
		if (deathRegex != null && deathRegex.length() != 0) {
			deathDatePattern = Pattern.compile(deathRegex);
		}

		crossLanguagePattern = Pattern.compile("\\[\\[(\\w\\w:[^\\]]+)\\]\\]");

		logger.info("templatePattern: " + templatePattern);
		logger.info("namePattern: " + namePattern);
		logger.info("surnamePattern: " + surnamePattern);
		logger.info("birthDatePattern: " + birthDatePattern);
		logger.info("deathDatePattern: " + deathDatePattern);
		logger.info("crossLanguagePattern: " + crossLanguagePattern);

		try {
			analysisWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaAnalysisFileName()));
			disambiguationWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaDisambiguationFileName()));
			titleIdWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTitleIdFileName()));
			contentPageTitleWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaContentPageFileName()));
			personInfoWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaPersonInfoFileName()));
			redirectWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaRedirFileName()));
			pageCategoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaPageCategoryFileName()));
			superCategoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaCategorySuperCategoryFileName()));
			categoryWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaCategoryFileName()));
			sectionTitleWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaSectionTitleFilePrefixName()));
			fileWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaFileName()));

			templateNameWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("name")));
			templateMapWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map")));
			templateMapWriterWithRepetitions = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map-rep")));
			templateMapWriterProp = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("map-prop")));

			templateNavigationWriter = utf8Writer(new FileOutputStream(extractorParameters.getWikipediaTemplateFileNames().get("navigation")));
		} catch (IOException e) {
			// Without all the writers the page callbacks cannot work: report the cause and stop here.
			logger.error("Unable to open the output files, the dump will not be processed", e);
			closeOpenedWriters();
			return;
		}
		startProcess(extractorParameters.getWikipediaXmlFileName());
	}

	/**
	 * Wraps {@code out} in a buffered {@link PrintWriter} that encodes characters as UTF-8.
	 */
	private static PrintWriter utf8Writer(OutputStream out) throws UnsupportedEncodingException {
		return new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8")));
	}

	/**
	 * Closes every output writer that has been opened so far; writers that are still
	 * {@code null} are skipped.
	 */
	private void closeOpenedWriters() {
		PrintWriter[] writers = {
				analysisWriter, disambiguationWriter, titleIdWriter, contentPageTitleWriter,
				personInfoWriter, redirectWriter, pageCategoryWriter, superCategoryWriter,
				categoryWriter, sectionTitleWriter, fileWriter, templateNameWriter,
				templateMapWriter, templateMapWriterWithRepetitions, templateMapWriterProp,
				templateNavigationWriter
		};
		for (PrintWriter writer : writers) {
			if (writer != null) {
				writer.close();
			}
		}
	}

	/**
	 * Handles a file page: records its title and id through {@code writeTitlePage} and
	 * appends the title (prefix included, see the todo below) to the file-page output.
	 *
	 * @param text   page text, not used here
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void filePage(String text, String title, int wikiID) {
		writeTitlePage(title, wikiID);
		// The writer is shared, so the write is serialized on this instance.
		synchronized (this) {
			//todo: don't save the prefix File: (in this case change WikipediaFileSourceExtractor.read)
			fileWriter.println(title);
		}
	}

	/**
	 * Handles a category page. Records its title and id, then writes one
	 * "category TAB super-category" line for every match of {@code categoryPattern} in the
	 * page text, followed by a "category TAB match count" line on the category output.
	 * <p>
	 * When {@code delCatLabel} is set, the category is the title without its prefix up to
	 * the first colon, and capture group 2 (instead of 1) provides the super category.
	 * Anything after a vertical line in the matched text is discarded.
	 *
	 * @param text   page text scanned for categories
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void categoryPage(String text, String title, int wikiID) {
		writeTitlePage(title, wikiID);

		Matcher matcher = categoryPattern.matcher(text);
		final int group = delCatLabel ? 2 : 1;
		final String category = delCatLabel
				? title.substring(title.indexOf(CharacterTable.COLON) + 1)
				: title;

		StringBuilder lines = new StringBuilder();
		int superCategoryCount = 0;
		for (; matcher.find(); superCategoryCount++) {
			String superCategory = text.substring(matcher.start(group), matcher.end(group))
					.replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);

			// Keep only the part before the vertical line, when there is one.
			int bar = superCategory.indexOf(CharacterTable.VERTICAL_LINE);
			String target = (bar == -1) ? superCategory : superCategory.substring(0, bar);

			lines.append(category);
			lines.append(CharacterTable.HORIZONTAL_TABULATION);
			lines.append(normalizePageName(target));
			lines.append(CharacterTable.LINE_FEED);
		}

		synchronized (this) {
			superCategoryWriter.print(lines);
			//todo: debug
			categoryWriter.println(category + "\t" + superCategoryCount);
		}
	}

	/**
	 * Handles a template page. Records its title and id, appends the title to the
	 * template-name output and, for every category of the template in which
	 * {@code navigationTemplatePattern} finds a match, appends group 1 of
	 * {@code simpleTemplatePattern} applied to the title to the navigation output.
	 * Nothing is written to the navigation output when either pattern is {@code null}.
	 *
	 * @param text   page text scanned for categories
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void templatePage(String text, String title, int wikiID) {
		writeTitlePage(title, wikiID);
		synchronized (this) {
			templateNameWriter.println(title);
		}

		// Extract categories
		//todo: make uppercase the first letter
		final int categoryGroup = 2;
		Matcher categoryMatcher = categoryPattern.matcher(text);
		while (categoryMatcher.find()) {
			String category = text.substring(categoryMatcher.start(categoryGroup), categoryMatcher.end(categoryGroup))
					.replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);

			if (navigationTemplatePattern == null) {
				continue;
			}
			if (!navigationTemplatePattern.matcher(category).find()) {
				continue;
			}
			if (simpleTemplatePattern == null) {
				continue;
			}

			Matcher titleMatcher = simpleTemplatePattern.matcher(title);
			if (!titleMatcher.find()) {
				continue;
			}

			String simpleTemplate = titleMatcher.group(1);
			synchronized (this) {
				templateNavigationWriter.append(simpleTemplate);
				templateNavigationWriter.append(CharacterTable.LINE_FEED);
			}
		}
	}

	/**
	 * Handles a redirect page: records its title and id, then writes
	 * "title TAB normalizePageName(text)" to the redirect output.
	 *
	 * @param text   text handed to {@code normalizePageName} to obtain the second column
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void redirectPage(String text, String title, int wikiID) {
		writeTitlePage(title, wikiID);

		String line = title + CharacterTable.HORIZONTAL_TABULATION + normalizePageName(text);
		synchronized (this) {
			redirectWriter.println(line);
		}
	}

	/**
	 * Handles a content page by running, in order, every per-page extraction step:
	 * content-page title, title/id, person info, section titles, categories and templates.
	 *
	 * @param text   page text
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void contentPage(String text, String title, int wikiID) {
		//crossLanguage(text, title);
		writeTitleContentPage(title, wikiID);
		writeTitlePage(title, wikiID);
		personInfo(text, title);
		textAndSections(text, title, wikiID);
		pageCategory(text, title);
		pageTemplate(text, title, wikiID);
	}

	/**
	 * Extracts the templates used in a page and writes three kinds of records:
	 * <ul>
	 * <li>template map: "title, template, index, wikiID", once per distinct template,
	 * where index counts the distinct templates of the page from 0;</li>
	 * <li>template map with repetitions: "title, template, parts count, newline count,
	 * key-value parts", once per template occurrence;</li>
	 * <li>template properties: "title, template, key", once per distinct
	 * (template, key) pair.</li>
	 * </ul>
	 * Templates without a name, or whose name starts with '#', are ignored.
	 *
	 * @param text   page text handed to {@code WikiTemplateParser}
	 * @param title  page title
	 * @param wikiID page id
	 */
	void pageTemplate(String text, String title, int wikiID) {
		// Typed collections: iterating a raw list as WikiTemplate does not compile.
		List<WikiTemplate> listOfTemplates = WikiTemplateParser.parse(text, false);

		Set<String> seenTemplates = new HashSet<String>();
		Set<String> seenKeys = new HashSet<String>();

		// Local buffers, so the unsynchronized StringBuilder is sufficient.
		StringBuilder toBeWrittenMap = new StringBuilder();
		StringBuilder toBeWrittenMapRep = new StringBuilder();
		StringBuilder toBeWrittenMapProp = new StringBuilder();

		int i = 0;
		for (WikiTemplate t : listOfTemplates) {
			Set<?> keys = t.getHashMapOfParts().keySet();
			String name = t.getFirstPart();
			if (name == null || name.length() == 0) {
				continue;
			}
			if (name.startsWith("#")) {
				continue;
			}
			name = normalizePageName(name.trim()).replace(' ', '_');

			// add() returns true only the first time the name is seen in this page
			if (seenTemplates.add(name)) {
				toBeWrittenMap.append(title + "\t" + name + "\t" + i + "\t" + wikiID).append("\n");
				i++;
			}

			toBeWrittenMapRep.append(title + "\t" + name + "\t" + t.getPartsCount() + "\t" + t.getNlCount() + "\t" + t.getKeyValueParts()).append("\n");

			for (Object key : keys) {
				String keyName = (String) key;
				if (seenKeys.add(name + ";" + keyName)) {
					toBeWrittenMapProp.append(title + "\t" + name + "\t" + keyName).append("\n");
				}
			}
		}

		synchronized (this) {
			templateMapWriter.print(toBeWrittenMap);
			templateMapWriterWithRepetitions.print(toBeWrittenMapRep);
			templateMapWriterProp.print(toBeWrittenMapProp);
		}

	}

	/**
	 * Writes one "title TAB category" line to the page-category output for every match of
	 * {@code categoryPattern} in the page text. Capture group 2 is used when
	 * {@code delCatLabel} is set, group 1 otherwise; spaces become low lines and anything
	 * after a vertical line is discarded before the name is normalized.
	 *
	 * @param text  page text scanned for categories
	 * @param title page title
	 */
	void pageCategory(String text, String title) {
		Matcher matcher = categoryPattern.matcher(text);
		final int group = delCatLabel ? 2 : 1;

		StringBuilder lines = new StringBuilder();
		while (matcher.find()) {
			String category = text.substring(matcher.start(group), matcher.end(group))
					.replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);

			int bar = category.indexOf(CharacterTable.VERTICAL_LINE);
			String label = (bar != -1) ? category.substring(0, bar) : category;

			lines.append(title);
			lines.append(CharacterTable.HORIZONTAL_TABULATION);
			lines.append(normalizePageName(label));
			lines.append(CharacterTable.LINE_FEED);
		}

		synchronized (this) {
			pageCategoryWriter.print(lines);
		}
	}

	/**
	 * Parses the page markup and writes one "title TAB section title" line for every
	 * section whose title is non-blank and is not matched by
	 * {@code sectionTitleSkipPattern} (when that pattern is set).
	 * <p>
	 * Any exception raised while parsing or writing is logged, with its stack trace, and
	 * not propagated, so that one bad page does not stop the extraction.
	 *
	 * @param text   page markup
	 * @param title  page title
	 * @param wikiID page id, used only in the error message
	 */
	void textAndSections(String text, String title, int wikiID) {
		try {
			WikiMarkupParser wikiMarkupParser = WikiMarkupParser.getInstance();
			String[] prefixes = {filePrefix, imagePrefix};
			ParsedPage parsedPage = wikiMarkupParser.parsePage(text, prefixes);

			// Sections
			StringBuilder sb = new StringBuilder();
			for (Section section : parsedPage.getSections()) {
				String sectionTitle = section.getTitle();
				if (sectionTitle == null || sectionTitle.trim().length() == 0) {
					continue;
				}
				if (sectionTitleSkipPattern != null && sectionTitleSkipPattern.matcher(sectionTitle).find()) {
					continue;
				}

				sb.append(title);
				sb.append(CharacterTable.HORIZONTAL_TABULATION);
				sb.append(sectionTitle);
				sb.append(CharacterTable.LINE_FEED);
			}
			synchronized (this) {
				sectionTitleWriter.print(sb.toString());
			}
		} catch (Exception e) {
			// Pass the exception to the logger so the cause is not lost.
			logger.error("Error processing page " + title + " (" + wikiID + ")", e);
		}
	}

	/**
	 * Returns the whole content of the page tokenized in a single line.
	 * The first token is the page title (with underscores)
	 */
	/*private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
		StringBuilder sb = new StringBuilder();
		sb.append(title);
		sb.append(CharacterTable.SPACE);
		Tokenizer tokenizer = HardTokenizer.getInstance();
		String tokenizedTitle = tokenizer.tokenizedString(title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE));
		sb.append(tokenizedTitle);
		String rawContent;
		String tokenizedContent;
		List list;
		for (Section section : parsedPage.getSections()) {
			list = section.getContentList();
			for (int i = 0; i < list.size(); i++) {
				rawContent = list.get(i).getText();
				if (rawContent.length() > 0) {
					tokenizedContent = tokenizer.tokenizedString(rawContent);
					if (tokenizedContent.length() > 0) {
						sb.append(CharacterTable.SPACE);
						sb.append(tokenizedContent);
					}
				}
			}
		}
		return sb.toString();
	} */


	/**
	 * Extracts person data from the personal-data template of a page and writes
	 * "title TAB name TAB surname TAB birth year TAB death year" to the person-info output.
	 * <p>
	 * Nothing is written when the template, the name or the surname cannot be found, or
	 * when the template, name or surname pattern has not been configured. The birth and
	 * death years are optional: a missing one leaves its column empty.
	 *
	 * @param text  page text searched for the personal-data template
	 * @param title page title, written as the first column
	 */
	void personInfo(String text, String title) {
		//todo: check why here!!!
		// These patterns are compiled conditionally in start(); without them no record can be produced.
		if (templatePattern == null || namePattern == null || surnamePattern == null) {
			return;
		}

		String templateText = firstGroup(templatePattern, text);
		if (templateText == null) {
			return;
		}

		String name = firstGroup(namePattern, templateText);
		String surname = firstGroup(surnamePattern, templateText);
		if (name == null || surname == null) {
			return;
		}

		String birthYear = firstGroup(birthDatePattern, templateText);
		String deathsYear = firstGroup(deathDatePattern, templateText);

		StringBuilder sb = new StringBuilder();
		sb.append(title);
		sb.append(CharacterTable.HORIZONTAL_TABULATION);
		sb.append(name);
		sb.append(CharacterTable.HORIZONTAL_TABULATION);
		sb.append(surname);
		sb.append(CharacterTable.HORIZONTAL_TABULATION);
		if (birthYear != null) {
			sb.append(birthYear);
		}
		sb.append(CharacterTable.HORIZONTAL_TABULATION);
		if (deathsYear != null) {
			sb.append(deathsYear);
		}

		synchronized (this) {
			personInfoWriter.println(sb.toString());
		}
	}

	/**
	 * Returns the trimmed capture group 1 of the first match of {@code pattern} in
	 * {@code input}, or {@code null} when the pattern is {@code null}, does not match, or
	 * matches without group 1 taking part in the match.
	 */
	private static String firstGroup(Pattern pattern, String input) {
		if (pattern == null) {
			return null;
		}
		Matcher matcher = pattern.matcher(input);
		if (!matcher.find()) {
			return null;
		}
		String group = matcher.group(1);
		return group == null ? null : group.trim();
	}

	/**
	 * Appends "title TAB wikiID" as one line to the title/id output.
	 *
	 * @param title  page title
	 * @param wikiID page id
	 */
	private void writeTitlePage(String title, int wikiID) {
		String line = title + CharacterTable.HORIZONTAL_TABULATION + wikiID;
		synchronized (this) {
			titleIdWriter.println(line);
		}
	}

	/**
	 * Appends "title TAB wikiID" as one line to the content-page title output.
	 *
	 * @param title  page title
	 * @param wikiID page id
	 */
	void writeTitleContentPage(String title, int wikiID) {
		String line = title + CharacterTable.HORIZONTAL_TABULATION + wikiID;
		synchronized (this) {
			contentPageTitleWriter.println(line);
		}
	}

	/*
	void crossLanguage(String text, String title) {
		StringBuilder buffer = new StringBuilder();
		Matcher m = crossLanguagePattern.matcher(text);
		buffer.append(title);

		while (m.find()) {
			int s = m.start(1);
			int e = m.end(1);
			String foreignPage = text.substring(s, e).replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);
			buffer.append(StringTable.HORIZONTAL_TABULATION);
			buffer.append(foreignPage);
		}

		synchronized (this) {
			crossLanguageWriter.println(buffer.toString());
		}
	}
  */

	/**
	 * Handles a portal page: only its title and id are recorded.
	 *
	 * @param text   page text, not used here
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void portalPage(String text, String title, int wikiID) {
		writeTitlePage(title, wikiID);
	}

	/**
	 * Handles a project page: only its title and id are recorded.
	 *
	 * @param text   page text, not used here
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void projectPage(String text, String title, int wikiID) {
		writeTitlePage(title, wikiID);
	}

	/**
	 * Handles a disambiguation page: records its title and id, then appends the title to
	 * the disambiguation output.
	 *
	 * @param text   page text, not used here
	 * @param title  page title
	 * @param wikiID page id
	 */
	@Override
	public void disambiguationPage(String text, String title, int wikiID) {
		writeTitlePage(title, wikiID);
		// The writer is shared, so the write is serialized on this instance.
		synchronized (this) {
			disambiguationWriter.println(title);
		}
	}

	/**
	 * Writes the current date followed by the page counters, one "key=value" line each,
	 * to the analysis output.
	 */
	void analysis() {
		synchronized (this) {
			String[] report = {
					"date=" + new Date(),
					"total=" + generalCount,
					"content=" + countPageCounter,
					"disambiguation=" + disambiguationPageCounter,
					"category=" + categoryPageCounter,
					"redirect=" + redirectPageCounter,
					"template=" + templatePageCounter,
					"mediawiki=" + mediawikiPageCounter,
					"wikipedia=" + wikipediaPageCounter,
					"file=" + filePageCounter,
					"special=" + specialPageCounter,
					"image=" + imagePageCounter,
					"project=" + projectPageCounter,
					"other=" + otherPageCounter
			};
			for (String entry : report) {
				analysisWriter.println(entry);
			}
		}
	}

	/**
	 * Finishes the extraction: lets the superclass complete, writes the analysis report
	 * and closes every output writer.
	 * <p>
	 * All writers opened by {@code start} are closed here, including the category,
	 * section-title and file writers; a buffered writer that is never closed never flushes
	 * its last block, which would truncate the corresponding output file. Writers that
	 * were never opened ({@code null}) are skipped.
	 */
	@Override
	public void endProcess() {
		super.endProcess();
		if (analysisWriter != null) {
			analysis();
		}

		PrintWriter[] writers = {
				analysisWriter,
				titleIdWriter,
				disambiguationWriter,
				personInfoWriter,
				redirectWriter,
				fileWriter,
				pageCategoryWriter,
				superCategoryWriter,
				categoryWriter,
				contentPageTitleWriter,
				sectionTitleWriter,
				templateNameWriter,
				templateMapWriter,
				templateMapWriterWithRepetitions,
				templateMapWriterProp,
				templateNavigationWriter
		};
		for (PrintWriter writer : writers) {
			if (writer != null) {
				writer.close();
			}
		}
	}

	/**
	 * Command-line entry point. Configures log4j from the file named by the
	 * {@code log-config} system property (default {@code configuration/log-config.txt}),
	 * parses the options, and runs a {@link WikipediaPreprocessing} on the given dump.
	 * <p>
	 * Invalid command lines, including a non-numeric value for one of the integer options,
	 * are logged and answered with the usage message instead of an uncaught exception.
	 * NOTE(review): the help and version options are registered but not acted upon here.
	 *
	 * @param args command-line arguments
	 * @throws IOException declared by the original signature
	 */
	public static void main(String args[]) throws IOException {
		String logConfig = System.getProperty("log-config");
		if (logConfig == null) {
			logConfig = "configuration/log-config.txt";
		}

		PropertyConfigurator.configure(logConfig);

		Options options = new Options();
		try {
			Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg().withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump").create("d");
			Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("output directory in which to store output files").isRequired().withLongOpt("output-dir").create("o");
			Option numThreadOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of threads (default " + AbstractWikipediaXmlDumpParser.DEFAULT_THREADS_NUMBER + ")").withLongOpt("num-threads").create("t");
			Option numPageOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of pages to process (default all)").withLongOpt("num-pages").create("p");
			Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default " + AbstractWikipediaExtractor.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("n");

			options.addOption("h", "help", false, "print this message");
			options.addOption("v", "version", false, "output version information and exit");

			options.addOption(wikipediaDumpOpt);
			options.addOption(outputDirOpt);
			options.addOption(numThreadOpt);
			options.addOption(numPageOpt);
			options.addOption(notificationPointOpt);
			CommandLineParser parser = new PosixParser();
			CommandLine line = parser.parse(options, args);
			logger.debug(line);

			int numThreads = intOption(line, "num-threads", AbstractWikipediaXmlDumpParser.DEFAULT_THREADS_NUMBER);
			int numPages = intOption(line, "num-pages", AbstractWikipediaExtractor.DEFAULT_NUM_PAGES);
			int notificationPoint = intOption(line, "notification-point", AbstractWikipediaExtractor.DEFAULT_NOTIFICATION_POINT);

			ExtractorParameters extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"));
			logger.debug(extractorParameters);
			WikipediaExtractor wikipediaExtractor = new WikipediaPreprocessing(numThreads, numPages, extractorParameters.getLocale());
			wikipediaExtractor.setNotificationPoint(notificationPoint);
			wikipediaExtractor.start(extractorParameters);

		} catch (ParseException e) {
			// oops, something went wrong
			logger.error("Parsing failed: " + e.getMessage() + "\n");
			HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp(200, "java -cp properties:dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaPreprocessing", "\n", options, "\n", true);
		} finally {
			logger.info("extraction ended " + new Date());
		}
	}

	/**
	 * Returns the integer value of the option {@code longName}, or {@code defaultValue}
	 * when the option is absent. A value that is not a valid integer is reported as a
	 * {@link ParseException} so that it is handled like any other command-line error.
	 */
	private static int intOption(CommandLine line, String longName, int defaultValue) throws ParseException {
		if (!line.hasOption(longName)) {
			return defaultValue;
		}
		String value = line.getOptionValue(longName);
		try {
			return Integer.parseInt(value);
		} catch (NumberFormatException e) {
			throw new ParseException("option --" + longName + " requires an integer value, found '" + value + "'");
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy