// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// eu.fbk.twm.wiki.xmldump.WikipediaExampleExtractor Maven / Gradle / Ivy

/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.fbk.twm.wiki.xmldump;

import de.tudarmstadt.ukp.wikipedia.parser.Link;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import eu.fbk.twm.utils.*;
import eu.fbk.twm.utils.analysis.HardTokenizer;
import eu.fbk.twm.utils.analysis.Tokenizer;
import eu.fbk.twm.wiki.xmldump.util.*;
import org.apache.commons.cli.*;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.xerial.snappy.SnappyOutputStream;

import java.io.*;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//todo: check that a form matches [a-z][a-z0-9]*
public class WikipediaExampleExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named WikipediaExampleExtractor.
	 */
	static Logger logger = Logger.getLogger(WikipediaExampleExtractor.class.getName());

	// Column positions (and total column count) of a tab-separated example record.
	// NOTE(review): these appear to mirror the field order emitted by Example.toString(int)
	// (form id, page id, form, page, source, ..., left context, right context) - confirm against the readers.
	// NOTE(review): Example.toString(int) writes the running counter at column 5 and the type at column 6,
	// whereas TYPE_INDEX is 5 and ID_INDEX is 6 here; one of the two sides looks swapped - TODO confirm.
	public static final int ID_FORM_INDEX = 0;

	public static final int ID_PAGE_INDEX = 1;

	public static final int FORM_INDEX = 2;

	public static final int PAGE_INDEX = 3;

	public static final int SOURCE_INDEX = 4;

	public static final int TYPE_INDEX = 5;

	public static final int ID_INDEX = 6;

	public static final int LEFT_CONTEXT_INDEX = 7;

	public static final int RIGHT_CONTEXT_INDEX = 8;

	// Number of columns in a record.
	public static final int COLUMN_NUMBER = 9;

	// Running number given to every example line written by contentPage/disambiguationPage (static, so shared by all instances).
	private static AtomicInteger exampleCounter = new AtomicInteger();

	// Writer of the example records; opened in start(), Snappy-compressed when isCompress() is true.
	private PrintWriter exampleWriter;

	// Writer of the page counter; filled and closed in endProcess().
	private PrintWriter pageCounterWriter;

	// Writer of the form counter; filled by writeFormCounter() and closed in endProcess().
	private PrintWriter formCounterWriter;

	// Writer of the form indexer; filled and closed in endProcess().
	private PrintWriter formIdWriter;

	// Loaded in start() from the redirect file; used to replace a linked page by its redirect target.
	private PageMap redirectPageMap;

	// Loaded in start() from the disambiguation file; pages in this set get no page/section/category/link-target examples.
	private PageSet disambiguationPageSet;

	// Loaded in start() from the content page file; its value for a page is written as the page id of a record.
	private PageMap contentPageMap;

	// Loaded in start() from the redirect file; maps a page to the set of titles redirecting to it.
	private ReversePageMap reverseRedirectPageMap;

	// Loaded in start() from the person info file; provides the surname used for surname examples.
	private PersonInfoMap personInformationMap;

	// Parser turning the wiki markup of a page into a ParsedPage.
	private WikiMarkupParser wikiMarkupParser;

	// Tokenizer applied to forms and contexts when a record is written.
	private Tokenizer tokenizer;

	// Case-insensitive pattern of section titles to ignore; stays null when the resource is not defined.
	private Pattern sectionTitleSkipPattern;

	// Counts how many Example objects were created per form (incremented in the Example constructor).
	private SynchronizedCounter formCounter;

	// Counts how many Example objects were created per page (incremented in the Example constructor).
	private SynchronizedCounter pageCounter;

	// Assigns the integer index written as the form id of a record.
	private SynchronizedIndexer formIndexer;

	// Threshold compared against the page/form counters before an example is kept.
	private int maximumNumberOfExamplesPerPage;

	public WikipediaExampleExtractor(int numThreads, int numPages, Locale locale) throws IOException {
		super(numThreads, numPages, locale);

		//SECTION_TITLE_SKIP_PATTERN
		if (resources.getString("SECTION_TITLE_SKIP_PATTERN") != null) {
			sectionTitleSkipPattern = Pattern.compile(resources.getString("SECTION_TITLE_SKIP_PATTERN"), Pattern.CASE_INSENSITIVE);
		}

		tokenizer = HardTokenizer.getInstance();
		wikiMarkupParser = WikiMarkupParser.getInstance();
		maximumNumberOfExamplesPerPage = DEFAULT_MAXIMUM_FORM_FREQ;
	}

	/**
	 * Returns the threshold compared against the page and form counters before an example is kept.
	 *
	 * @return the current threshold
	 */
	public int getMaximumNumberOfExamplesPerPage() {
		return maximumNumberOfExamplesPerPage;
	}

	/**
	 * Sets the threshold compared against the page and form counters before an example is kept.
	 *
	 * @param maximumNumberOfExamplesPerPage the new threshold
	 */
	public void setMaximumNumberOfExamplesPerPage(int maximumNumberOfExamplesPerPage) {
		this.maximumNumberOfExamplesPerPage = maximumNumberOfExamplesPerPage;
	}

	/**
	 * Loads the lookup tables (redirects, disambiguation pages, content pages, person
	 * information), opens the four output writers, creates the counters and the form indexer,
	 * and finally starts processing the XML dump.
	 * <p>
	 * If a table cannot be read or an output file cannot be opened, the error is logged with its
	 * stack trace, every writer opened so far is closed and the dump is NOT processed: running
	 * the page callbacks without tables and writers could only fail on every page.
	 *
	 * @param extractorParameters supplies the names of all input and output files
	 */
	@Override
	public void start(ExtractorParameters extractorParameters) {
		try {
			redirectPageMap = new PageMap(new File(extractorParameters.getWikipediaRedirFileName()));
			logger.info(redirectPageMap.size() + " redirect pages");

			reverseRedirectPageMap = new ReversePageMap(new File(extractorParameters.getWikipediaRedirFileName()));
			logger.info(reverseRedirectPageMap.size() + " reverse redirect pages");

			disambiguationPageSet = new PageSet(new File(extractorParameters.getWikipediaDisambiguationFileName()));
			logger.info(disambiguationPageSet.size() + " disambiguation pages");

			contentPageMap = new PageMap(new File(extractorParameters.getWikipediaContentPageFileName()));
			logger.info(contentPageMap.size() + " content pages");

			personInformationMap = new PersonInfoMap(new File(extractorParameters.getWikipediaPersonInfoFileName()));
			logger.info(personInformationMap.size() + " person information");

			logger.info("example file: " + extractorParameters.getWikipediaExampleFileName());
			if (isCompress()) {
				logger.info(extractorParameters.getWikipediaExampleFileName() + " is compressed");
				exampleWriter = openUtf8Writer(new SnappyOutputStream(new FileOutputStream(extractorParameters.getWikipediaExampleFileName())));
			}
			else {
				exampleWriter = openUtf8Writer(new FileOutputStream(extractorParameters.getWikipediaExampleFileName()));
			}

			logger.info("form/freq file: " + extractorParameters.getWikipediaFormFreqFileName());
			formCounterWriter = openUtf8Writer(new FileOutputStream(extractorParameters.getWikipediaFormFreqFileName()));

			logger.info("page/freq file: " + extractorParameters.getWikipediaPageFreqFileName());
			pageCounterWriter = openUtf8Writer(new FileOutputStream(extractorParameters.getWikipediaPageFreqFileName()));

			logger.info("form/index file: " + extractorParameters.getWikipediaFormIdFileName());
			formIdWriter = openUtf8Writer(new FileOutputStream(extractorParameters.getWikipediaFormIdFileName()));

			formIndexer = new SynchronizedIndexer();
			formCounter = new SynchronizedCounter();
			pageCounter = new SynchronizedCounter();
		} catch (IOException e) {
			logger.error("unable to initialise the example extractor, the dump is not processed", e);
			// release whatever was opened before the failure
			PrintWriter[] writers = {exampleWriter, formCounterWriter, pageCounterWriter, formIdWriter};
			for (PrintWriter writer : writers) {
				if (writer != null) {
					writer.close();
				}
			}
			return;
		}
		startProcess(extractorParameters.getWikipediaXmlFileName());
	}

	/**
	 * Wraps a byte stream in a buffered UTF-8 print writer.
	 *
	 * @param out the stream to write to
	 * @return the writer
	 * @throws IOException if the encoding is not supported
	 */
	private static PrintWriter openUtf8Writer(OutputStream out) throws IOException {
		return new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8")));
	}

	/**
	 * Does nothing: this extractor produces no output for file pages.
	 */
	@Override
	public void filePage(String text, String title, int wikiID) {
		// intentionally empty
	}

	/**
	 * Builds the examples of a disambiguation page (the builder is created with the
	 * disambiguation flag set, so only link examples are collected) and appends every
	 * non-empty one, numbered with the shared example counter, to the example file.
	 * <p>
	 * Any failure is logged together with the page title, id and stack trace; the page is then skipped.
	 *
	 * @param text   the wiki markup of the page
	 * @param title  the page title
	 * @param wikiID the page id, used only in the error message
	 */
	@Override
	public void disambiguationPage(String text, String title, int wikiID) {
		try {
			ExampleBuilder exampleBuilder = new ExampleBuilder(text, title, true);
			// NOTE(review): the element type was lost in the scraped source; Example is what the loop requires
			List<Example> list = exampleBuilder.getExampleList();

			StringBuilder sb = new StringBuilder();
			for (Example example : list) {
				if (!example.isEmpty()) {
					sb.append(example.toString(exampleCounter.incrementAndGet()));
					sb.append(CharacterTable.LINE_FEED);
				}
			}

			// the writer is shared, so the records of one page are written as a single unit
			synchronized (this) {
				exampleWriter.print(sb.toString());
			}
		} catch (Exception e) {
			logger.error("Error at page " + title + " (" + wikiID + ")", e);
		}
	}

	/**
	 * Logs one tab-separated progress row, preceded by the column header the first time.
	 * The row has exactly one value per header column (the original emitted an extra empty
	 * column between the page and form counts).
	 */
	@Override
	public void printLog() {
		if (printHeader) {
			logger.info("total\tcontent\tredirect\tdisambiguation\tcategory\tpage\tform\ttime\tdate");
			printHeader = false;
		}
		logger.info(decimalFormat.format(generalCount.intValue())
				+ "\t" + decimalFormat.format(countPageCounter)
				+ "\t" + decimalFormat.format(redirectPageCounter)
				+ "\t" + decimalFormat.format(disambiguationPageCounter)
				+ "\t" + decimalFormat.format(categoryPageCounter)
				+ "\t" + decimalFormat.format(pageCounter.size())
				+ "\t" + decimalFormat.format(formCounter.size())
				+ "\t" + decimalFormat.format(genEnd.longValue() - genBegin.longValue())
				+ "\t" + new Date());
	}

	/**
	 * Builds the examples of a content page and appends every non-empty one, numbered with
	 * the shared example counter, to the example file.
	 * <p>
	 * Any failure is logged together with the page title, id and stack trace; the page is then skipped.
	 *
	 * @param text   the wiki markup of the page
	 * @param title  the page title
	 * @param wikiID the page id, used only in the error message
	 */
	@Override
	public void contentPage(String text, String title, int wikiID) {
		try {
			ExampleBuilder exampleBuilder = new ExampleBuilder(text, title, false);
			// NOTE(review): the element type was lost in the scraped source; Example is what the loop requires
			List<Example> list = exampleBuilder.getExampleList();

			StringBuilder sb = new StringBuilder();
			for (Example example : list) {
				if (!example.isEmpty()) {
					sb.append(example.toString(exampleCounter.incrementAndGet()));
					sb.append(CharacterTable.LINE_FEED);
				}
			}

			// the writer is shared, so the records of one page are written as a single unit
			synchronized (this) {
				exampleWriter.print(sb.toString());
			}
		} catch (Exception e) {
			logger.error("Error at page " + title + " (" + wikiID + ")", e);
		}
	}

	/**
	 * This class is designed for building an Example.
	 */
	class ExampleBuilder {
		// Context value used when an example has nothing on that side.
		public final static String EMPTY_CONTEXT = "";

		// Separator appended by buildLeftContext between the generated title header and the link's own left context.
		public final static String END_OF_SENTENCE = ". ";

		// Examples collected by the constructor, in the order they were added.
		List exampleList;

		// Result of PageTypeExtractor.isNominal() for the page; enables the lower-cased form variants.
		boolean nominal;

		//boolean disambiguation;

		/**
		 * Parses the page and fills the example list.
		 * <p>
		 * For a regular page the examples are added in this order: page, text, category,
		 * section title and finally link examples. For a disambiguation page only the link
		 * examples are added.
		 *
		 * @param text           the wiki markup of the page
		 * @param title          the page title
		 * @param disambiguation true to collect link examples only
		 * @throws IOException declared for the parsing step
		 */
		ExampleBuilder(String text, String title, boolean disambiguation) throws IOException {
			exampleList = new ArrayList();

			ParsedPage parsedPage = wikiMarkupParser.parsePage(text, new String[]{filePrefix, imagePrefix});
			ParsedPageTitle parsedPageTitle = new ParsedPageTitle(title);
			nominal = new PageTypeExtractor(text, parsedPageTitle.getForm()).isNominal();

			if (!disambiguation) {
				// no separate suffix example: addPageExamples already uses the title suffix as left context
				addPageExamples(parsedPage, parsedPageTitle);
				addTextExample(parsedPage, parsedPageTitle);
				addCategoryExamples(text, parsedPageTitle);
				addSectionTitleExamples(parsedPage, parsedPageTitle);
			}

			// links are harvested from every kind of page
			addLinkExamples(parsedPage, parsedPageTitle);
		}

		/**
		 * Returns the examples collected by the constructor.
		 *
		 * @return the internal list itself, not a copy
		 */
		public List getExampleList() {
			return exampleList;
		}

		/*private void buildOtherExamples(String form, String page, String source, String leftContext, String rightContext, String type) {
			Example example = new Example(form, page, source, leftContext, rightContext, type);
			addExample(example);
			addPersonSurnameExample(example);
			//addRedirectLinkExamples(example);
		} */

		/**
		 * Keeps the example unless the counter of its page is already above the configured maximum.
		 */
		private void addExample(Example example) {
			//todo: decide whether the limit should apply per sense (page) or per form; the page counter is used for now
			boolean overLimit = pageCounter.get(example.getPage()) > maximumNumberOfExamplesPerPage;
			if (overLimit) {
				return;
			}
			exampleList.add(example);
		}

		/**
		 * Adds an example whose right context is the title suffix, together with its nominal,
		 * surname and redirect variants. Titles without a suffix produce nothing.
		 */
		private void addSuffixExample(ParsedPageTitle parsedPageTitle) {
			if (!parsedPageTitle.hasSuffix()) {
				return;
			}

			String suffix = parsedPageTitle.getSuffix();
			Example suffixExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, suffix, Example.CONTENT_FROM_TITLE_SUFFIX);
			addExample(suffixExample);
			addNominalVariantExample(suffixExample);
			addPersonSurnameExample(suffixExample);
			addRedirectLinkExamples(suffixExample);
		}

		/**
		 * Adds one example per section title, using the title as right context, together with
		 * its nominal, surname and redirect variants.
		 * <p>
		 * A section is ignored when it has no title, when the title fully matches the skip
		 * pattern, when the page is a disambiguation page or when the page title is not
		 * compliant. When no skip pattern is configured no title is filtered out (the original
		 * failed with a NullPointerException on every section in that case).
		 * A failure on one section is logged and does not affect the other sections.
		 *
		 * @param parsedPage      the parsed page
		 * @param parsedPageTitle the parsed page title
		 */
		private void addSectionTitleExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
			for (Section section : parsedPage.getSections()) {
				try {
					String sectionTitle = section.getTitle();
					if (sectionTitle == null) {
						continue;
					}
					if (sectionTitleSkipPattern != null && sectionTitleSkipPattern.matcher(sectionTitle).matches()) {
						continue;
					}
					//todo: check if this disambiguation check can be removed, it should have been already done
					if (disambiguationPageSet.contains(parsedPageTitle.getPage())) {
						continue;
					}
					if (!parsedPageTitle.isCompliant()) {
						continue;
					}

					Example example = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, sectionTitle, Example.CONTENT_FROM_SECTION_TITLE);
					addExample(example);
					addNominalVariantExample(example);
					addPersonSurnameExample(example);
					addRedirectLinkExamples(example);
				} catch (Exception ex) {
					logger.error("Exception adding section examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")", ex);
				}
			}
		}

		/**
		 * Debug helper: parses the templates of the page and logs the parts of each one.
		 * No example is added.
		 *
		 * @param text            the wiki markup of the page
		 * @param parsedPageTitle the parsed page title, logged before the templates
		 */
		private void addTemplateExamples(String text, ParsedPageTitle parsedPageTitle) {
			// NOTE(review): the element type was lost in the scraped source; WikiTemplate is what the loop requires
			ArrayList<WikiTemplate> templateList = WikiTemplateParser.parse(text, false);
			logger.debug(parsedPageTitle.getPage());
			for (WikiTemplate t : templateList) {
				logger.debug(t.getHashMapOfParts());
			}
		}

		/**
		 * Adds an example whose right context is the whole text of the page, together with
		 * its nominal, surname and redirect variants. Nothing is added when the page title is
		 * not compliant. The example is appended directly, without the per-page limit applied
		 * by addExample. Failures are logged, never propagated.
		 *
		 * @param parsedPage      the parsed page providing the text
		 * @param parsedPageTitle the parsed page title
		 */
		private void addTextExample(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
			try {
				if (!parsedPageTitle.isCompliant()) {
					return;
				}

				Example textExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, parsedPage.getText(), Example.CONTENT_FROM_TEXT);
				exampleList.add(textExample);
				addNominalVariantExample(textExample);
				addPersonSurnameExample(textExample);
				addRedirectLinkExamples(textExample);
			} catch (Exception ex) {
				logger.error("Exception adding text examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
			}
		}

		/**
		 * Adds one example per category found in the markup, using the category name (the
		 * part before a vertical line, if any) as right context, together with its nominal,
		 * surname and redirect variants.
		 * <p>
		 * A category is ignored when it equals the form of the page title, when the page is a
		 * disambiguation page or when the page title is not compliant. The example is appended
		 * directly, without the per-page limit applied by addExample. A failure on one category
		 * is logged and does not affect the others.
		 *
		 * @param text            the wiki markup of the page
		 * @param parsedPageTitle the parsed page title
		 */
		private void addCategoryExamples(String text, ParsedPageTitle parsedPageTitle) {
			Matcher categoryMatcher = categoryPattern.matcher(text);
			while (categoryMatcher.find()) {
				try {
					String category = text.substring(categoryMatcher.start(2), categoryMatcher.end(2));
					int separator = category.indexOf(CharacterTable.VERTICAL_LINE);
					if (separator != -1) {
						category = category.substring(0, separator);
					}

					if (category.equals(parsedPageTitle.getForm()) || disambiguationPageSet.contains(parsedPageTitle.getPage())) {
						continue;
					}
					if (!parsedPageTitle.isCompliant()) {
						continue;
					}

					Example categoryExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, category, Example.CONTENT_FROM_CATEGORY);
					exampleList.add(categoryExample);
					addNominalVariantExample(categoryExample);
					addPersonSurnameExample(categoryExample);
					addRedirectLinkExamples(categoryExample);
				} catch (Exception ex) {
					logger.error("Exception adding category examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
				}
			}
		}

		/**
		 * Adds one example per compliant internal link of the page.
		 * <p>
		 * The linked page is first replaced by its redirect target (at most two redirect hops
		 * are followed; a second hop is logged as a warning). The example is kept only when
		 * the resolved page is a known content page and not a disambiguation page. Its form
		 * is the link text, its source is the current page, its left context is produced by
		 * buildLeftContext and its right context comes from the link. A surname variant is
		 * added as well; redirect variants are deliberately not added for links.
		 * A failure on one link is logged and does not affect the other links.
		 *
		 * @param parsedPage      the parsed page
		 * @param parsedPageTitle the parsed page title
		 */
		private void addLinkExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
			for (Section section : parsedPage.getSections()) {
				// NOTE(review): the element type was lost in the scraped source; Link is what the loop requires
				List<Link> internalLinks = section.getLinks(Link.type.INTERNAL);
				String sectionTitle = section.getTitle();
				for (Link link : internalLinks) {
					try {
						ParsedPageLink parsedPageLink = new ParsedPageLink(link);
						if (!parsedPageLink.isCompliant()) {
							continue;
						}

						//todo: check multiple redirects
						String redirectPage = redirectPageMap.get(parsedPageLink.getPage());
						if (redirectPage != null) {
							String secondRedirectPage = redirectPageMap.get(redirectPage);
							if (secondRedirectPage != null) {
								logger.warn(parsedPageLink.getPage() + " -> " + redirectPage + " -> " + secondRedirectPage);
								parsedPageLink.setPage(secondRedirectPage);
							}
							else {
								parsedPageLink.setPage(redirectPage);
							}
						}

						//todo: drop the content page check if used on the whole dump
						if (contentPageMap.get(parsedPageLink.getPage()) != null && !disambiguationPageSet.contains(parsedPageLink.getPage())) {
							ParsedPageTitle parsedLinkTitle = new ParsedPageTitle(parsedPageLink.getPage());
							Example example = new Example(parsedPageLink.getForm(), parsedPageLink.getPage(), parsedPageTitle.getPage(), buildLeftContext(parsedPageLink, parsedPageTitle, parsedLinkTitle, sectionTitle), parsedPageLink.getRightContext(), Example.CONTENT_FROM_LINK);
							addExample(example);
							addPersonSurnameExample(example);
							// redirect variants are not added here: they introduce noise into the page/forms mapping
						}
					} catch (Exception ex) {
						logger.error("Exception adding link examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")", ex);
					}
				}
			}
		}

		/**
		 * Creates the left context of a link example: the form of the page title, the section
		 * title (if any and not skipped), the form and suffix of the linked page title (only
		 * when that form differs from the link text), the end-of-sentence marker and finally
		 * the link's own left context.
		 * <p>
		 * When no skip pattern is configured the section title is always included (the
		 * original failed with a NullPointerException in that case).
		 *
		 * @param parsedPageLink  the parsed page link
		 * @param parsedPageTitle the parsed page title
		 * @param parsedLinkTitle the parsed page title of the link; when null it is built from the link's page
		 * @param sectionTitle    the section title, may be null
		 * @return the left context
		 */
		private String buildLeftContext(ParsedPageLink parsedPageLink, ParsedPageTitle parsedPageTitle, ParsedPageTitle parsedLinkTitle, String sectionTitle) {
			StringBuilder leftContextBuilder = new StringBuilder();
			// add the page title (form)
			leftContextBuilder.append(parsedPageTitle.getForm());
			leftContextBuilder.append(CharacterTable.SPACE);

			if (sectionTitle != null) {
				// NOTE(review): find() is used here while addSectionTitleExamples uses matches() - confirm which is intended
				boolean skipped = sectionTitleSkipPattern != null && sectionTitleSkipPattern.matcher(sectionTitle).find();
				if (!skipped) {
					// add the section title
					leftContextBuilder.append(sectionTitle);
					leftContextBuilder.append(CharacterTable.SPACE);
				}
			}

			// reuse the title parsed by the caller instead of parsing the same page again
			ParsedPageTitle linkTitle = parsedLinkTitle != null ? parsedLinkTitle : new ParsedPageTitle(parsedPageLink.getPage());
			if (!parsedPageLink.getForm().equals(linkTitle.getForm())) {
				// add the linked page title (form)
				leftContextBuilder.append(linkTitle.getForm());
				leftContextBuilder.append(CharacterTable.SPACE);
				if (linkTitle.hasSuffix()) {
					// add the linked page title's suffix
					leftContextBuilder.append(linkTitle.getSuffix());
					leftContextBuilder.append(CharacterTable.SPACE);
				}
			}
			leftContextBuilder.append(END_OF_SENTENCE);
			leftContextBuilder.append(parsedPageLink.getLeftContext());
			return leftContextBuilder.toString();
		}

		/**
		 * Tells whether the string starts with a lower-case character. Only the first
		 * character is inspected.
		 *
		 * @param s the string to test, may be null or empty
		 * @return true if the first character is lower case; false otherwise, including for a
		 *         null or empty string (the original threw on an empty string)
		 */
		private boolean isLowerCase(String s) {
			return s != null && s.length() > 0 && Character.isLowerCase(s.charAt(0));
		}

		/**
		 * Adds a variant of the example for every compliant redirect title of its page, using
		 * the form of the redirect title instead of the original form. Each distinct form is
		 * added once, and never when it equals the original form. A nominal variant is added
		 * for each new redirect example.
		 * <p>
		 * When the original form starts with a lower-case character the redirect form is
		 * lower-cased too. The original called toLowerCase() without using its result, which
		 * has no effect because strings are immutable.
		 * A failure on one redirect is logged and does not affect the others.
		 *
		 * @param example the example to derive the variants from
		 */
		private void addRedirectLinkExamples(Example example) {
			// NOTE(review): the element type was lost in the scraped source; String is what the loop requires
			Set<String> redirectSet = reverseRedirectPageMap.get(example.getPage());
			if (redirectSet == null) {
				return;
			}

			Set<String> formSet = new HashSet<String>();
			formSet.add(example.getForm());
			for (String redirectTitle : redirectSet) {
				try {
					ParsedPageTitle redirectParsedPageTitle = new ParsedPageTitle(redirectTitle);
					if (!redirectParsedPageTitle.isCompliant()) {
						continue;
					}

					String redirectForm = redirectParsedPageTitle.getForm();
					if (isLowerCase(example.getForm())) {
						redirectForm = redirectForm.toLowerCase();
					}
					// add() returns false when the form was already produced
					if (formSet.add(redirectForm)) {
						Example redirectExample = new Example(redirectForm, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), example.getType() + Example.CONTENT_FROM_REDIRECTION_PAGE);
						addExample(redirectExample);
						addNominalVariantExample(redirectExample);
					}
				} catch (Exception ex) {
					logger.error("Exception adding redirect link examples (" + exampleCounter.intValue() + ")", ex);
				}
			}
		}

		/**
		 * Adds the example describing the page itself: the left context is the title suffix
		 * (if any) and the right context is the text of the first section (if any). Nominal,
		 * surname and redirect-page variants are added as well.
		 * <p>
		 * Nothing is added for disambiguation pages or non-compliant titles. Failures are
		 * logged, never propagated.
		 *
		 * @param parsedPage      the parsed page
		 * @param parsedPageTitle the parsed page title
		 */
		private void addPageExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
			try {
				if (disambiguationPageSet.contains(parsedPageTitle.getPage())) {
					return;
				}

				Section opening = parsedPage.getSection(0);
				String after = opening != null ? opening.getText() : EMPTY_CONTEXT;
				String before = parsedPageTitle.hasSuffix() ? parsedPageTitle.getSuffix() : EMPTY_CONTEXT;

				if (!parsedPageTitle.isCompliant()) {
					return;
				}

				Example pageExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), before, after, Example.CONTENT_FROM_PAGE);
				addExample(pageExample);
				addNominalVariantExample(pageExample);
				addPersonSurnameExample(pageExample);
				addRedirectPageExamples(pageExample, parsedPageTitle);
			} catch (Exception ex) {
				logger.error("Exception adding page examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
			}
		}

		/**
		 * Adds alternative examples created from the page title by using the redirect pages:
		 * for every compliant redirect title of the page, an example with the redirect form,
		 * the redirect's own suffix as left context and the right context and type (marked as
		 * redirection) of the original example. A nominal variant is added for each.
		 * <p>
		 * The left context is computed per redirect. The original kept it in a variable
		 * declared outside the loop, so the suffix of one redirect leaked into the following
		 * redirects that had no suffix.
		 * A failure on one redirect is logged and does not affect the others.
		 *
		 * @param example         the original example
		 * @param parsedPageTitle the source parsed page title
		 */
		private void addRedirectPageExamples(Example example, ParsedPageTitle parsedPageTitle) {
			// NOTE(review): the element type was lost in the scraped source; String is what the loop requires
			Set<String> redirectSet = reverseRedirectPageMap.get(parsedPageTitle.getPage());
			if (redirectSet == null) {
				return;
			}

			for (String redirectTitle : redirectSet) {
				try {
					ParsedPageTitle redirectParsedPageTitle = new ParsedPageTitle(redirectTitle);
					String leftContext = redirectParsedPageTitle.hasSuffix() ? redirectParsedPageTitle.getSuffix() : EMPTY_CONTEXT;
					if (redirectParsedPageTitle.isCompliant()) {
						Example redirectExample = new Example(redirectParsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), leftContext, example.getRightContext(), example.getType() + Example.CONTENT_FROM_REDIRECTION_PAGE);
						addExample(redirectExample);
						addNominalVariantExample(redirectExample);
					}
				} catch (Exception ex) {
					logger.error("Exception adding redirect page examples (" + exampleCounter.intValue() + ")", ex);
				}
			}
		}

		/**
		 * When the page of the example is a known person with a non-empty surname, adds a copy
		 * of the example whose form is the surname and whose type is marked as person
		 * information. The copy is kept only while the form counter of the surname is within
		 * the configured maximum. Failures are logged, never propagated.
		 *
		 * @param example the example to derive the surname variant from
		 */
		private void addPersonSurnameExample(Example example) {
			PersonInfoMap.Person person = personInformationMap.get(example.getPage());
			if (person == null) {
				return;
			}

			try {
				String surname = person.getSurname();
				if (surname.length() == 0) {
					return;
				}

				// the copy is created before the limit is checked, so it is counted even when it is dropped
				Example surnameExample = new Example(surname, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), example.getType() + Example.CONTENT_FROM_PERSON_INFORMATION);
				boolean withinLimit = formCounter.get(surname) <= maximumNumberOfExamplesPerPage;
				if (withinLimit) {
					exampleList.add(surnameExample);
				}
			} catch (Exception ex) {
				logger.error("Exception adding person info examples (" + exampleCounter.intValue() + ")\n" + ex);
			}
		}

		/**
		 * For a nominal page, adds a copy of the example with the form in lower case and the
		 * type marked as nominal. The copy is kept only while the form counter of the
		 * lower-cased form is within the configured maximum.
		 *
		 * @param example the example to derive the lower-cased variant from
		 */
		private void addNominalVariantExample(Example example) {
			if (!nominal) {
				return;
			}

			String loweredForm = example.getForm().toLowerCase();
			// the copy is created before the limit is checked, so it is counted even when it is dropped
			Example loweredExample = new Example(loweredForm, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), example.getType() + Example.CONTENT_FROM_NOMINAL);
			boolean withinLimit = formCounter.get(loweredForm) <= maximumNumberOfExamplesPerPage;
			if (withinLimit) {
				exampleList.add(loweredExample);
			}
		}
	}

	/**
	 * One extracted example: a form (surface text) pointing to a page, the page it was found
	 * in (source), the text on both sides and a type code telling where it came from.
	 * Creating an instance increments the form and page counters of the enclosing extractor.
	 */
	class Example {
		// One-letter type codes; derived examples append their code to the type of the original.
		public static final String CONTENT_FROM_PERSON_INFORMATION = "I";

		public static final String CONTENT_FROM_REDIRECTION_PAGE = "R";

		public static final String CONTENT_FROM_LINK = "L";

		public static final String CONTENT_FROM_PAGE = "P";

		public static final String CONTENT_FROM_CATEGORY = "C";

		public static final String CONTENT_FROM_SECTION_TITLE = "S";

		public static final String CONTENT_FROM_NOMINAL = "N";

		public static final String CONTENT_FROM_TEXT = "T";

		public static final String CONTENT_FROM_TITLE_SUFFIX = "U";

		private String form;

		private String page;

		private String source;

		private String leftContext;

		private String rightContext;

		private String type;

		/**
		 * Stores the values and counts the form and the page.
		 *
		 * @param form         the surface form
		 * @param page         the page the form refers to
		 * @param source       the page the example was extracted from
		 * @param leftContext  the text before the form
		 * @param rightContext the text after the form
		 * @param type         the type code(s)
		 */
		Example(String form, String page, String source, String leftContext, String rightContext, String type) {
			//todo:remove from here
			formCounter.add(form);
			pageCounter.add(page);

			this.form = form;
			this.page = page;
			this.source = source;
			this.leftContext = leftContext;
			this.rightContext = rightContext;
			this.type = type;
		}

		/** @return the type code(s) */
		public String getType() {
			return type;
		}

		/** @param type the type code(s) */
		public void setType(String type) {
			this.type = type;
		}

		/** @return the page the example was extracted from */
		public String getSource() {
			return source;
		}

		/** @param source the page the example was extracted from */
		public void setSource(String source) {
			this.source = source;
		}

		/** @return the page the form refers to */
		public String getPage() {
			return page;
		}

		/** @param page the page the form refers to */
		public void setPage(String page) {
			this.page = page;
		}

		/** @return the text before the form */
		public String getLeftContext() {
			return leftContext;
		}

		/** @param leftContext the text before the form */
		public void setLeftContext(String leftContext) {
			this.leftContext = leftContext;
		}

		/** @return the surface form */
		public String getForm() {
			return form;
		}

		/** @param form the surface form */
		public void setForm(String form) {
			this.form = form;
		}

		/** @return the text after the form */
		public String getRightContext() {
			return rightContext;
		}

		/** @param rightContext the text after the form */
		public void setRightContext(String rightContext) {
			this.rightContext = rightContext;
		}

		/**
		 * Tells whether the example is unusable.
		 *
		 * @return true when the form, page or source is null or empty, when a context is
		 *         null, or when both contexts are empty
		 */
		public boolean isEmpty() {
			boolean missingKey = form == null || form.length() == 0
					|| page == null || page.length() == 0
					|| source == null || source.length() == 0;
			if (missingKey || leftContext == null || rightContext == null) {
				return true;
			}
			return leftContext.length() == 0 && rightContext.length() == 0;
		}

		/**
		 * @return the record with 0 as counter value
		 */
		public String toString() {
			return toString(0);
		}

		/**
		 * Formats the example as one tab-separated record: form index, page id, tokenized
		 * form, page, source, counter, type, tokenized left context, tokenized right context.
		 *
		 * @param count the counter value to write
		 * @return the record, without line terminator
		 */
		public String toString(int count) {
			String tokenizedForm = tokenizer.tokenizedString(form);
			int formIndex = formIndexer.get(tokenizedForm);
			String pageIndex = contentPageMap.get(page);

			StringBuilder record = new StringBuilder();
			record.append(formIndex).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(pageIndex).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(tokenizedForm).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(page).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(source).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(count).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(type).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(tokenizer.tokenizedString(leftContext)).append(CharacterTable.HORIZONTAL_TABULATION);
			record.append(tokenizer.tokenizedString(rightContext));
			return record.toString();
		}

	}

	/**
	 * Does nothing: this extractor produces no output for category pages.
	 */
	@Override
	public void categoryPage(String text, String title, int wikiID) {
		// intentionally empty
	}

	/**
	 * Does nothing: this extractor produces no output for template pages.
	 */
	@Override
	public void templatePage(String text, String title, int wikiID) {
		// intentionally empty
	}

	/**
	 * Does nothing: this extractor produces no output for redirect pages.
	 */
	@Override
	public void redirectPage(String text, String title, int wikiID) {
		// intentionally empty
	}

	/**
	 * Does nothing: this extractor produces no output for portal pages.
	 */
	@Override
	public void portalPage(String text, String title, int wikiID) {
		// intentionally empty
	}

	/**
	 * Does nothing: this extractor produces no output for project pages.
	 */
	@Override
	public void projectPage(String text, String title, int wikiID) {
		// intentionally empty
	}

	/**
	 * Finishes the extraction: after the superclass clean-up, writes the form counter, the
	 * page counter and the form indexer to their files and closes all writers.
	 * <p>
	 * Each writer is closed even when writing to it fails, and the example writer is closed
	 * even when an unexpected runtime exception interrupts the method. I/O errors are logged
	 * with their stack trace and do not prevent the remaining files from being written.
	 */
	@Override
	public void endProcess() {
		super.endProcess();
		try {
			try {
				logger.info("writing " + decimalFormat.format(formCounter.size()) + " forms (counter)...");
				writeFormCounter();
			} catch (IOException e) {
				logger.error("unable to write the form counter", e);
			} finally {
				closeIfOpened(formCounterWriter);
			}

			try {
				logger.info("writing " + decimalFormat.format(pageCounter.size()) + " pages (counter)...");
				pageCounter.write(pageCounterWriter);
			} catch (IOException e) {
				logger.error("unable to write the page counter", e);
			} finally {
				closeIfOpened(pageCounterWriter);
			}

			try {
				logger.info("writing " + decimalFormat.format(formIndexer.size()) + " forms (indexer)...");
				formIndexer.write(formIdWriter);
			} catch (IOException e) {
				logger.error("unable to write the form indexer", e);
			} finally {
				closeIfOpened(formIdWriter);
			}
		} finally {
			logger.debug("closing the output stream...");
			closeIfOpened(exampleWriter);
		}
	}

	/**
	 * Closes the writer unless it was never opened.
	 *
	 * @param writer the writer to close, may be null
	 */
	private static void closeIfOpened(PrintWriter writer) {
		if (writer != null) {
			writer.close();
		}
	}

	/**
	 * Writes the form counter to the form/freq file, one line per form: the count, a tab and
	 * the tokenized form, in the iteration order of the counter's sorted map.
	 * <p>
	 * NOTE(review): the generic parameters of the map were lost in the scraped source
	 * ({@code SortedMap> sortedMap}); the key type is taken from the original loop variable
	 * and wildcards are used for the values, which are only converted with toString().
	 *
	 * @throws IOException declared by the original signature
	 */
	public void writeFormCounter() throws IOException {
		SortedMap<AtomicInteger, ? extends List<?>> sortedMap = formCounter.getSortedMap();
		// iterating the entries avoids looking every key up again
		for (Map.Entry<AtomicInteger, ? extends List<?>> entry : sortedMap.entrySet()) {
			AtomicInteger frequency = entry.getKey();
			for (Object form : entry.getValue()) {
				formCounterWriter.print(frequency.toString());
				formCounterWriter.print(CharacterTable.HORIZONTAL_TABULATION);
				formCounterWriter.println(tokenizer.tokenizedString(form.toString()));
			}
		}
	}

	/**
	 * Command-line entry point: configures logging (system property {@code log-config},
	 * default {@code configuration/log-config.txt}), parses the options, creates the output
	 * directory if needed and runs the extraction.
	 * <p>
	 * When the command line cannot be parsed, including a non-numeric value for a numeric
	 * option, the reason and the usage are printed to standard output. The usage names this
	 * class (the original printed the name of a package that no longer matches).
	 *
	 * @param args the command-line arguments
	 * @throws IOException if the extractor cannot be created
	 */
	public static void main(String[] args) throws IOException {
		String logConfig = System.getProperty("log-config");
		if (logConfig == null) {
			logConfig = "configuration/log-config.txt";
		}

		PropertyConfigurator.configure(logConfig);

		Options options = new Options();
		try {
			Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg().withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump").create("d");
			Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("output directory in which to store output files").isRequired().withLongOpt("output-dir").create("o");
			Option numThreadOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of threads (default " + Defaults.DEFAULT_THREADS_NUMBER + ")").withLongOpt("num-threads").create("t");
			Option numPageOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of pages to process (default all)").withLongOpt("num-pages").create("p");
			Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default " + Defaults.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("n");
			Option maximumFormFreqOpt = OptionBuilder.withArgName("max-freq").hasArg().withDescription("maximum frequency of wanted forms (default is " + WikipediaExtractor.DEFAULT_MAXIMUM_FORM_FREQ + ")").withLongOpt("max-freq").create("m");
			options.addOption("h", "help", false, "print this message");
			options.addOption("v", "version", false, "output version information and exit");
			Option baseDirOpt = OptionBuilder.withDescription("if set, use the output folder as base dir").withLongOpt("base-dir").create();

			options.addOption(wikipediaDumpOpt);
			options.addOption(outputDirOpt);
			options.addOption(numThreadOpt);
			options.addOption(numPageOpt);
			options.addOption(notificationPointOpt);
			options.addOption(maximumFormFreqOpt);
			options.addOption(baseDirOpt);
			CommandLineParser parser = new PosixParser();
			CommandLine line = parser.parse(options, args);

			int numThreads = intOption(line, "num-threads", Defaults.DEFAULT_THREADS_NUMBER);
			int numPages = intOption(line, "num-pages", Defaults.DEFAULT_NUM_PAGES);
			int notificationPoint = intOption(line, "notification-point", Defaults.DEFAULT_NOTIFICATION_POINT);

			ExtractorParameters extractorParameters;
			if (line.hasOption("base-dir")) {
				extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"), true);
			}
			else {
				extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"));
			}
			File dest = new File(extractorParameters.getExtractionOutputDirName());

			if (dest.mkdirs()) {
				logger.info(dest + " created");
			}

			logger.debug(extractorParameters);
			int maximumFormFreq = intOption(line, "max-freq", WikipediaExampleExtractor.DEFAULT_MAXIMUM_FORM_FREQ);
			logger.debug("filtering examples with frequency higher than " + maximumFormFreq + "...");

			logger.debug("extracting examples (" + extractorParameters.getWikipediaExampleFileName() + ")...");
			WikipediaExampleExtractor wikipediaExtractor = new WikipediaExampleExtractor(numThreads, numPages, extractorParameters.getLocale());
			wikipediaExtractor.setNotificationPoint(notificationPoint);
			wikipediaExtractor.setMaximumNumberOfExamplesPerPage(maximumFormFreq);
			wikipediaExtractor.start(extractorParameters);

			logger.info("extraction ended " + new Date());

		} catch (ParseException e) {
			// the command line could not be understood: explain why and show the usage
			System.out.println("Parsing failed: " + e.getMessage() + "\n");
			HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp(400, "java -cp dist/thewikimachine.jar " + WikipediaExampleExtractor.class.getName(), "\n", options, "\n", true);
		}
	}

	/**
	 * Reads an integer option.
	 *
	 * @param line         the parsed command line
	 * @param name         the long name of the option
	 * @param defaultValue the value returned when the option is absent
	 * @return the option value, or the default
	 * @throws ParseException if the option is present but its value is not an integer
	 */
	private static int intOption(CommandLine line, String name, int defaultValue) throws ParseException {
		if (!line.hasOption(name)) {
			return defaultValue;
		}
		String value = line.getOptionValue(name);
		try {
			return Integer.parseInt(value);
		} catch (NumberFormatException e) {
			// reported like any other malformed command line instead of crashing with a stack trace
			throw new ParseException("option --" + name + " expects an integer but got '" + value + "'");
		}
	}
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy