
/*
* Copyright (2015) Fondazione Bruno Kessler (http://www.fbk.eu/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.fbk.twm.wiki.xmldump;
import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import de.tudarmstadt.ukp.wikipedia.parser.SectionContainer;
import eu.fbk.twm.utils.*;
import eu.fbk.twm.utils.analysis.HardTokenizer;
import eu.fbk.twm.utils.analysis.Tokenizer;
import eu.fbk.twm.wiki.xmldump.util.WikiMarkupParser;
import org.apache.commons.cli.*;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Extracts section data from pages in two tab-separated formats:
* 1) PAGE_TITLE SECTION_TITLE SECTION_TEXT
* 2) PAGE_TITLE SECTION_COUNT
*
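* Example rows (fields are tab-separated; the page and values below are illustrative, not from a real run):
* 1) Alan_Turing   Early life.Education   Alan Mathison Turing was born in Maida Vale , London ...
* 2) Alan_Turing   12
*
* Typical invocation (classpath and file names are illustrative):
*   java -cp thewikimachine.jar eu.fbk.twm.wiki.xmldump.WikipediaSectionExtractor \
*     -d enwiki-pages-articles.xml -o /data/wiki-sections -t 4
*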
* @see eu.fbk.twm.index.PageSectionTextIndexer
*/
public class WikipediaSectionExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
static Logger logger = Logger.getLogger(WikipediaSectionExtractor.class.getName());
private PrintWriter sectionWriter;
private PrintWriter sectionPerPageFrequencyWriter;
private PrintWriter sectionTitleFrequencyWriter;
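//Optional pattern of section titles to skip, read from the SECTION_TITLE_SKIP_PATTERN resource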
private Pattern sectionSkipPattern;
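//Counts how many times each emitted section title occurs across all pages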
private SynchronizedCounter sectionTitleCounter;
public WikipediaSectionExtractor(int numThreads, int numPages, Locale locale) {
super(numThreads, numPages, locale);
}
@Override
public void start(ExtractorParameters extractorParameters) {
sectionWriter = getWriterForFileName(extractorParameters.getWikipediaSectionFileName());
sectionPerPageFrequencyWriter = getWriterForFileName(extractorParameters.getWikipediaPageSectionFreqFileName());
sectionTitleFrequencyWriter = getWriterForFileName(extractorParameters.getWikipediaSectionFreqFileName());
if (resources.getString("SECTION_TITLE_SKIP_PATTERN") != null) {
sectionSkipPattern = Pattern.compile(resources.getString("SECTION_TITLE_SKIP_PATTERN"), Pattern.CASE_INSENSITIVE);
}
sectionTitleCounter = new SynchronizedCounter();
startProcess(extractorParameters.getWikipediaXmlFileName());
}
@Override
public void filePage(String text, String title, int wikiID) {
}
@Override
public void categoryPage(String text, String title, int wikiID) {
}
@Override
public void contentPage(String text, String title, int wikiID) {
try {
ArrayList<String> s = sectionData(text, title);
String outputSections = StringUtils.join(s, CharacterTable.LINE_FEED) + CharacterTable.LINE_FEED;
String outputFrequency = title + CharacterTable.HORIZONTAL_TABULATION + s.size() + CharacterTable.LINE_FEED;
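//Writers are shared by the extractor's worker threads, so the two prints are serialized together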
synchronized (this) {
sectionWriter.print(outputSections);
//sectionWriter.print(CharacterTable.LINE_FEED);
sectionPerPageFrequencyWriter.print(outputFrequency);
}
} catch (IOException e) {
logger.error(e);
}
}
/**
* Extracts one tab-separated row per section from the raw markup of a single page.
*/
public ArrayList<String> sectionData(String text, String title) throws IOException {
logger.trace("Processing page: " + title);
ParsedPage parsedPage = WikiMarkupParser.getInstance().parsePage(text);
ArrayList<String> sections = new ArrayList<>();
for (Section section : parsedPage.getSections()) {
appendSection(title, section, null, sections);
}
return sections;
}
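/**
* Maximum heading depth emitted as separate rows; appendSection stops recursing
* into subsections at this level.
*/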
public static final int MAX_SECTION_LEVEL = 3;
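/**
* Recursively walks a section tree and appends one tab-separated row per kept
* section to the accumulator. Nested titles are joined with a dot (e.g.
* "History.Origins"); an untitled top-level section with text is emitted as "Abstract".
*/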
private void appendSection(String pageTitle, Section section, String titlePrefix, ArrayList<String> accumulator) {
String sectionTitle = section.getTitle();
if (sectionTitle == null) {
//An untitled section that still has paragraphs is treated as the page summary
if (section.nrOfParagraphs() == 0) {
return;
}
if (titlePrefix == null) {
sectionTitle = "Abstract";
}
else {
sectionTitle = "";
}
}
else {
if (sectionTitle.trim().length() == 0 || shouldBeSkipped(sectionTitle)) {
return;
}
}
if (titlePrefix != null) {
if (sectionTitle.length() == 0) {
sectionTitle = titlePrefix;
}
else {
sectionTitle = titlePrefix + "." + sectionTitle;
}
}
//If we have not reached a leaf or the maximum level, keep processing subsections recursively
if (section instanceof SectionContainer) {
//Cast to SectionContainer to access subsections, which the Section class does not expose
SectionContainer container = (SectionContainer) section;
if (container.getLevel() < MAX_SECTION_LEVEL && container.nrOfSubSections() > 0) {
for (Section subsection : container.getSubSections()) {
appendSection(pageTitle, subsection, sectionTitle, accumulator);
}
return;
}
}
Tokenizer tokenizer = HardTokenizer.getInstance();
//Building clean section text from the parsed section content
StringBuilder sectionTextBuilder = new StringBuilder();
for (Content content : section.getContentList()) {
String rawContent = content.getText();
if (rawContent.length() == 0) {
continue;
}
String tokenizedContent = tokenizer.tokenizedString(rawContent);
if (tokenizedContent.length() > 0) {
sectionTextBuilder.append(CharacterTable.SPACE);
sectionTextBuilder.append(tokenizedContent);
}
}
String sectionText = sectionTextBuilder.toString();
String subsectionTitle = section.getTitle() != null ? section.getTitle() : "";
sectionText = sectionText.trim();
if (sectionText.startsWith(subsectionTitle)) {
sectionText = sectionText.substring(subsectionTitle.length());
}
//Output format: Page Title \t Section Title \t Section Text
sectionTitleCounter.add(sectionTitle);
accumulator.add(pageTitle + CharacterTable.HORIZONTAL_TABULATION + sectionTitle + CharacterTable.HORIZONTAL_TABULATION + sectionText);
}
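/**
* Returns true if the title matches the case-insensitive SECTION_TITLE_SKIP_PATTERN.
* The pattern is configuration-dependent; typical uses drop housekeeping sections
* such as "References" or "External links".
*/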
private boolean shouldBeSkipped(String title) {
if (sectionSkipPattern == null) {
return false;
}
Matcher matcher = sectionSkipPattern.matcher(title);
return matcher.find();
}
@Override
public void disambiguationPage(String text, String title, int wikiID) {
}
@Override
public void templatePage(String text, String title, int wikiID) {
}
@Override
public void redirectPage(String text, String title, int wikiID) {
}
@Override
public void portalPage(String text, String title, int wikiID) {
}
@Override
public void projectPage(String text, String title, int wikiID) {
}
@Override
public void endProcess() {
super.endProcess();
sectionPerPageFrequencyWriter.flush();
sectionPerPageFrequencyWriter.close();
sectionWriter.flush();
sectionWriter.close();
try {
logger.info("writing " + decimalFormat.format(sectionTitleCounter.size()) + " section-title frequencies...");
writeSectionTitleFrequency();
sectionTitleFrequencyWriter.close();
} catch (IOException e) {
logger.error(e);
}
}
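/**
* Writes one tab-separated (frequency, section title) line per distinct title,
* iterating the frequency-sorted buckets returned by SynchronizedCounter.getSortedMap().
*/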
public void writeSectionTitleFrequency() throws IOException {
SortedMap<AtomicInteger, List<String>> sortedMap = sectionTitleCounter.getSortedMap();
Iterator<AtomicInteger> it = sortedMap.keySet().iterator();
AtomicInteger i;
while (it.hasNext()) {
i = it.next();
List<String> list = sortedMap.get(i);
for (int j = 0; j < list.size(); j++) {
sectionTitleFrequencyWriter.print(i.toString());
sectionTitleFrequencyWriter.print(CharacterTable.HORIZONTAL_TABULATION);
//sectionTitleFrequencyWriter.println(tokenizer.tokenizedString(list.get(j).toString()));
sectionTitleFrequencyWriter.println(list.get(j).toString());
}
}
}
public static void main(String[] argv) throws IOException {
String logConfig = System.getProperty("log-config");
if (logConfig == null) {
logConfig = "configuration/log-config.txt";
}
Properties defaultProps = new Properties();
defaultProps.load(new InputStreamReader(new FileInputStream(logConfig), "UTF-8"));
Options options = new Options();
try {
Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg().withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump").create("d");
Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("output directory in which to store output files").isRequired().withLongOpt("output-dir").create("o");
Option numThreadOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of threads (default " + Defaults.DEFAULT_THREADS_NUMBER
+ ")").withLongOpt("num-threads").create("t");
Option numPageOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of pages to process (default all)").withLongOpt("num-pages").create("p");
Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default " + Defaults.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("n");
options.addOption("h", "help", false, "print this message");
options.addOption("v", "version", false, "output version information and exit");
options.addOption(null, "info", false, "Switch log level to info");
options.addOption(null, "debug", false, "Switch log level to debug");
options.addOption(null, "trace", false, "Switch log level to trace");
options.addOption(wikipediaDumpOpt);
options.addOption(outputDirOpt);
options.addOption(numThreadOpt);
options.addOption(numPageOpt);
options.addOption(notificationPointOpt);
CommandLineParser parser = new PosixParser();
CommandLine line = parser.parse(options, argv);
String logLevel = defaultProps.getProperty("log4j.rootLogger");
String logOut;
if (logLevel == null) {
logLevel = "info,stdout";
}
//The rootLogger property has the form "<level>,<appenders>"; split it so the level can be overridden
int argumentPoint = logLevel.indexOf(',');
if (argumentPoint < 0) {
logOut = ",stdout";
}
else {
logOut = logLevel.substring(argumentPoint);
logLevel = logLevel.substring(0, argumentPoint);
}
if (line.hasOption("info")) {
logLevel = "info";
}
if (line.hasOption("debug")) {
logLevel = "debug";
}
if (line.hasOption("trace")) {
logLevel = "trace";
}
defaultProps.setProperty("log4j.rootLogger", logLevel + logOut);
PropertyConfigurator.configure(defaultProps);
int numThreads = Defaults.DEFAULT_THREADS_NUMBER;
if (line.hasOption("num-threads")) {
numThreads = Integer.parseInt(line.getOptionValue("num-threads"));
}
int numPages = Defaults.DEFAULT_NUM_PAGES;
if (line.hasOption("num-pages")) {
numPages = Integer.parseInt(line.getOptionValue("num-pages"));
}
int notificationPoint = Defaults.DEFAULT_NOTIFICATION_POINT;
if (line.hasOption("notification-point")) {
notificationPoint = Integer.parseInt(line.getOptionValue("notification-point"));
}
ExtractorParameters parameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"));
logger.debug(parameters);
WikipediaSectionExtractor extractor = new WikipediaSectionExtractor(numThreads, numPages, parameters.getLocale());
extractor.setNotificationPoint(notificationPoint);
extractor.start(parameters);
logger.info("Extraction ended " + new Date());
} catch (ParseException e) {
// oops, something went wrong
System.out.println("Parsing failed: " + e.getMessage() + "\n");
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(400, "java -cp dist/thewikimachine.jar eu.fbk.twm.wiki.xmldump.WikipediaSectionExtractor", "\n", options, "\n", true);
}
}
}