
// eu.fbk.twm.wiki.xmldump.WikipediaExampleExtractor (listing header: Maven / Gradle / Ivy)
/*
* Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.fbk.twm.wiki.xmldump;
import de.tudarmstadt.ukp.wikipedia.parser.Link;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import eu.fbk.twm.utils.*;
import eu.fbk.twm.utils.analysis.HardTokenizer;
import eu.fbk.twm.utils.analysis.Tokenizer;
import eu.fbk.twm.wiki.xmldump.util.*;
import org.apache.commons.cli.*;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.xerial.snappy.SnappyOutputStream;
import java.io.*;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//todo: check that a form matches [a-z][a-z0-9]*
public class WikipediaExampleExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
/**
* Logger shared by this extractor, named after the WikipediaExampleExtractor class.
*/
static Logger logger = Logger.getLogger(WikipediaExampleExtractor.class.getName());
// NOTE(review): the constants below look like the column positions of the tab-separated
// record built by Example.toString(int) -- confirm against the readers of the example file.
// That method emits the running count before the type (count at position 5, type at
// position 6), which does not agree with TYPE_INDEX = 5 and ID_INDEX = 6 -- verify.
public static final int ID_FORM_INDEX = 0;
public static final int ID_PAGE_INDEX = 1;
public static final int FORM_INDEX = 2;
public static final int PAGE_INDEX = 3;
public static final int SOURCE_INDEX = 4;
public static final int TYPE_INDEX = 5;
public static final int ID_INDEX = 6;
public static final int LEFT_CONTEXT_INDEX = 7;
public static final int RIGHT_CONTEXT_INDEX = 8;
public static final int COLUMN_NUMBER = 9;
// Source of the running number passed to Example.toString(int) for every example written.
private static AtomicInteger exampleCounter = new AtomicInteger();
// Output writers: opened in start(), closed in endProcess().
private PrintWriter exampleWriter;
private PrintWriter pageCounterWriter;
private PrintWriter formCounterWriter;
private PrintWriter formIdWriter;
// Lookup tables loaded in start() from the files named by the ExtractorParameters.
private PageMap redirectPageMap;
private PageSet disambiguationPageSet;
private PageMap contentPageMap;
private ReversePageMap reverseRedirectPageMap;
private PersonInfoMap personInformationMap;
// Helpers obtained in the constructor.
private WikiMarkupParser wikiMarkupParser;
private Tokenizer tokenizer;
// Only compiled in the constructor when the SECTION_TITLE_SKIP_PATTERN resource is non-null.
private Pattern sectionTitleSkipPattern;
// Created in start(); the Example constructor adds every form/page to the two counters.
private SynchronizedCounter formCounter;
private SynchronizedCounter pageCounter;
private SynchronizedIndexer formIndexer;
// Threshold the counters are compared against before an example is kept.
private int maximumNumberOfExamplesPerPage;
public WikipediaExampleExtractor(int numThreads, int numPages, Locale locale) throws IOException {
super(numThreads, numPages, locale);
//SECTION_TITLE_SKIP_PATTERN
if (resources.getString("SECTION_TITLE_SKIP_PATTERN") != null) {
sectionTitleSkipPattern = Pattern.compile(resources.getString("SECTION_TITLE_SKIP_PATTERN"), Pattern.CASE_INSENSITIVE);
}
tokenizer = HardTokenizer.getInstance();
wikiMarkupParser = WikiMarkupParser.getInstance();
maximumNumberOfExamplesPerPage = DEFAULT_MAXIMUM_FORM_FREQ;
}
/**
 * Returns the threshold the page/form counters are compared against when examples are added.
 *
 * @return the current threshold
 */
public int getMaximumNumberOfExamplesPerPage() {
return maximumNumberOfExamplesPerPage;
}
/**
 * Sets the threshold the page/form counters are compared against when examples are added.
 *
 * @param maximumNumberOfExamplesPerPage the new threshold
 */
public void setMaximumNumberOfExamplesPerPage(int maximumNumberOfExamplesPerPage) {
this.maximumNumberOfExamplesPerPage = maximumNumberOfExamplesPerPage;
}
/**
 * Loads the lookup tables, opens the four output files and then starts processing the dump.
 * <p>
 * If any table or output file cannot be opened the extraction is aborted: running the
 * process with missing tables or writers would only produce a failure for every page.
 *
 * @param extractorParameters supplies the names of all input and output files
 */
@Override
public void start(ExtractorParameters extractorParameters) {
    try {
        redirectPageMap = new PageMap(new File(extractorParameters.getWikipediaRedirFileName()));
        logger.info(redirectPageMap.size() + " redirect pages");
        reverseRedirectPageMap = new ReversePageMap(new File(extractorParameters.getWikipediaRedirFileName()));
        logger.info(reverseRedirectPageMap.size() + " reverse redirect pages");
        disambiguationPageSet = new PageSet(new File(extractorParameters.getWikipediaDisambiguationFileName()));
        logger.info(disambiguationPageSet.size() + " disambiguation pages");
        contentPageMap = new PageMap(new File(extractorParameters.getWikipediaContentPageFileName()));
        logger.info(contentPageMap.size() + " content pages");
        personInformationMap = new PersonInfoMap(new File(extractorParameters.getWikipediaPersonInfoFileName()));
        logger.info(personInformationMap.size() + " person information");

        logger.info("example file: " + extractorParameters.getWikipediaExampleFileName());
        OutputStream exampleStream = new FileOutputStream(extractorParameters.getWikipediaExampleFileName());
        if (isCompress()) {
            logger.info(extractorParameters.getWikipediaExampleFileName() + " is compressed");
            exampleStream = new SnappyOutputStream(exampleStream);
        }
        exampleWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(exampleStream, "UTF-8")));

        logger.info("form/freq file: " + extractorParameters.getWikipediaFormFreqFileName());
        formCounterWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaFormFreqFileName()), "UTF-8")));
        logger.info("page/freq file: " + extractorParameters.getWikipediaPageFreqFileName());
        pageCounterWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaPageFreqFileName()), "UTF-8")));
        logger.info("form/index file: " + extractorParameters.getWikipediaFormIdFileName());
        formIdWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaFormIdFileName()), "UTF-8")));

        formIndexer = new SynchronizedIndexer();
        formCounter = new SynchronizedCounter();
        pageCounter = new SynchronizedCounter();
    } catch (IOException e) {
        // Pass the throwable so the stack trace is logged, release what was opened, and stop.
        logger.error("Unable to initialize the example extractor, extraction aborted", e);
        closeWritersAfterFailedStart();
        return;
    }
    startProcess(extractorParameters.getWikipediaXmlFileName());
}

/** Closes whichever output writers start() managed to open before it failed. */
private void closeWritersAfterFailedStart() {
    PrintWriter[] opened = {exampleWriter, formCounterWriter, pageCounterWriter, formIdWriter};
    for (PrintWriter writer : opened) {
        if (writer != null) {
            writer.close();
        }
    }
}
/** No-op: this extractor does nothing for file pages. */
@Override
public void filePage(String text, String title, int wikiID) {
// no examples are extracted from file pages
}
/**
 * Builds the examples of a disambiguation page and appends the non-empty ones to the
 * example file.
 *
 * @param text   the wiki markup of the page
 * @param title  the page title
 * @param wikiID the page identifier, used only for error reporting
 */
@Override
public void disambiguationPage(String text, String title, int wikiID) {
    try {
        ExampleBuilder exampleBuilder = new ExampleBuilder(text, title, true);
        List<Example> examples = exampleBuilder.getExampleList();
        StringBuilder sb = new StringBuilder();
        for (Example example : examples) {
            if (!example.isEmpty()) {
                sb.append(example.toString(exampleCounter.incrementAndGet()));
                sb.append(CharacterTable.LINE_FEED);
            }
        }
        // One print per page so the lines of a page are not interleaved with other threads' output.
        synchronized (this) {
            exampleWriter.print(sb.toString());
        }
    } catch (Exception e) {
        // Same message as contentPage; passing the throwable keeps the stack trace.
        logger.error("Error at page " + title + " (" + wikiID + ")", e);
    }
}
/**
 * Logs one tab-separated progress row, preceded once by the column header.
 * The row has exactly one value per header column.
 */
@Override
public void printLog() {
    if (printHeader) {
        logger.info("total\tcontent\tredirect\tdisambiguation\tcategory\tpage\tform\ttime\tdate");
        printHeader = false;
    }
    // The original emitted two tabs after the page count, shifting form/time/date
    // one column to the right of their headers.
    logger.info(decimalFormat.format(generalCount.intValue())
            + "\t" + decimalFormat.format(countPageCounter)
            + "\t" + decimalFormat.format(redirectPageCounter)
            + "\t" + decimalFormat.format(disambiguationPageCounter)
            + "\t" + decimalFormat.format(categoryPageCounter)
            + "\t" + decimalFormat.format(pageCounter.size())
            + "\t" + decimalFormat.format(formCounter.size())
            + "\t" + decimalFormat.format(genEnd.longValue() - genBegin.longValue())
            + "\t" + new Date());
}
/**
 * Builds the examples of a content page and appends the non-empty ones to the example file.
 *
 * @param text   the wiki markup of the page
 * @param title  the page title
 * @param wikiID the page identifier, used only for error reporting
 */
@Override
public void contentPage(String text, String title, int wikiID) {
    try {
        ExampleBuilder exampleBuilder = new ExampleBuilder(text, title, false);
        List<Example> examples = exampleBuilder.getExampleList();
        StringBuilder sb = new StringBuilder();
        for (Example example : examples) {
            if (!example.isEmpty()) {
                sb.append(example.toString(exampleCounter.incrementAndGet()));
                sb.append(CharacterTable.LINE_FEED);
            }
        }
        // One print per page so the lines of a page are not interleaved with other threads' output.
        synchronized (this) {
            exampleWriter.print(sb.toString());
        }
    } catch (Exception e) {
        // A single log entry carrying both the page and the stack trace.
        logger.error("Error at page " + title + " (" + wikiID + ")", e);
    }
}
/**
* This class is designed for building an Example.
*/
class ExampleBuilder {
public final static String EMPTY_CONTEXT = "";
public final static String END_OF_SENTENCE = ". ";
// Examples collected for the page, in the order they were produced.
List<Example> exampleList;
// Result of PageTypeExtractor.isNominal() for this page; enables the lowercase variants.
boolean nominal;

/**
 * Parses the page and collects its examples.
 *
 * @param text           the wiki markup of the page
 * @param title          the page title
 * @param disambiguation when true only the link examples are collected
 * @throws IOException declared for the parsing helpers
 */
ExampleBuilder(String text, String title, boolean disambiguation) throws IOException {
    exampleList = new ArrayList<Example>();
    String[] prefixes = {filePrefix, imagePrefix};
    ParsedPage parsedPage = wikiMarkupParser.parsePage(text, prefixes);
    ParsedPageTitle parsedPageTitle = new ParsedPageTitle(title);
    PageTypeExtractor pageTypeExtractor = new PageTypeExtractor(text, parsedPageTitle.getForm());
    nominal = pageTypeExtractor.isNominal();
    if (!disambiguation) {
        addPageExamples(parsedPage, parsedPageTitle);
        addTextExample(parsedPage, parsedPageTitle);
        addCategoryExamples(text, parsedPageTitle);
        addSectionTitleExamples(parsedPage, parsedPageTitle);
        // the title suffix is already used by addPageExamples as left context,
        // so addSuffixExample is not called here
    }
    addLinkExamples(parsedPage, parsedPageTitle);
}
/**
 * Returns the examples collected by the constructor.
 *
 * @return the live list held by this builder (not a copy)
 */
public List<Example> getExampleList() {
    return exampleList;
}
/*private void buildOtherExamples(String form, String page, String source, String leftContext, String rightContext, String type) {
Example example = new Example(form, page, source, leftContext, rightContext, type);
addExample(example);
addPersonSurnameExample(example);
//addRedirectLinkExamples(example);
} */
/**
 * Keeps the example unless the counter of its page already exceeds the configured maximum.
 */
private void addExample(Example example) {
    //todo: change with pageCounter no more than maximumNumberOfExamplesPerPage examples per sense not per form?
    int seenForPage = pageCounter.get(example.getPage());
    if (seenForPage > maximumNumberOfExamplesPerPage) {
        return;
    }
    exampleList.add(example);
}
/**
 * Adds an example whose right context is the suffix of the page title, together with its
 * nominal, surname and redirect variants. Does nothing when the title has no suffix.
 */
private void addSuffixExample(ParsedPageTitle parsedPageTitle) {
    if (!parsedPageTitle.hasSuffix()) {
        return;
    }
    String suffix = parsedPageTitle.getSuffix();
    Example suffixExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, suffix, Example.CONTENT_FROM_TITLE_SUFFIX);
    addExample(suffixExample);
    addNominalVariantExample(suffixExample);
    addPersonSurnameExample(suffixExample);
    addRedirectLinkExamples(suffixExample);
}
/**
 * Adds one example per section title, using the title as right context.
 * <p>
 * A section is ignored when it has no title, when its title fully matches the skip pattern,
 * when the page is a disambiguation page or when the page title is not compliant.
 * A failure on one section is logged and does not stop the remaining sections.
 *
 * @param parsedPage      the parsed page
 * @param parsedPageTitle the parsed page title
 */
private void addSectionTitleExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
    for (Section section : parsedPage.getSections()) {
        try {
            String sectionTitle = section.getTitle();
            if (sectionTitle == null) {
                continue;
            }
            // The pattern is optional (see the constructor); without it no title is skipped.
            if (sectionTitleSkipPattern != null && sectionTitleSkipPattern.matcher(sectionTitle).matches()) {
                continue;
            }
            //todo: check if this disambiguation check can be removed, it should have been already done
            if (disambiguationPageSet.contains(parsedPageTitle.getPage())) {
                continue;
            }
            if (!parsedPageTitle.isCompliant()) {
                continue;
            }
            Example example = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, sectionTitle, Example.CONTENT_FROM_SECTION_TITLE);
            addExample(example);
            addNominalVariantExample(example);
            addPersonSurnameExample(example);
            addRedirectLinkExamples(example);
        } catch (Exception ex) {
            logger.error("Exception adding section examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
        }
    }
}
/**
 * Debug helper: parses the templates of the page and logs their parts. It adds no examples.
 *
 * @param text            the wiki markup of the page
 * @param parsedPageTitle the parsed page title
 */
private void addTemplateExamples(String text, ParsedPageTitle parsedPageTitle) {
    List<WikiTemplate> templateList = WikiTemplateParser.parse(text, false);
    logger.debug(parsedPageTitle.getPage());
    for (WikiTemplate template : templateList) {
        logger.debug(template.getHashMapOfParts());
    }
}
/**
 * Adds an example whose right context is the whole text of the page, plus its nominal,
 * surname and redirect variants. Nothing is added when the page title is not compliant.
 * The example goes straight into the list, without the threshold check of addExample.
 * Any exception is logged and swallowed.
 *
 * @param parsedPage      the parsed page
 * @param parsedPageTitle the parsed page title
 */
private void addTextExample(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
    try {
        if (!parsedPageTitle.isCompliant()) {
            return;
        }
        Example textExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, parsedPage.getText(), Example.CONTENT_FROM_TEXT);
        exampleList.add(textExample);
        addNominalVariantExample(textExample);
        addPersonSurnameExample(textExample);
        addRedirectLinkExamples(textExample);
    } catch (Exception ex) {
        logger.error("Exception adding text examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
    }
}
/**
 * Adds one example per category found in the page markup, using the category name (the part
 * before any vertical line) as right context.
 * <p>
 * A category is ignored when it equals the form of the page title, when the page is a
 * disambiguation page or when the page title is not compliant. The examples go straight into
 * the list, without the threshold check of addExample. A failure on one category is logged
 * and does not stop the remaining ones.
 *
 * @param text            the wiki markup of the page
 * @param parsedPageTitle the parsed page title
 */
private void addCategoryExamples(String text, ParsedPageTitle parsedPageTitle) {
    Matcher categoryMatcher = categoryPattern.matcher(text);
    while (categoryMatcher.find()) {
        try {
            String category = text.substring(categoryMatcher.start(2), categoryMatcher.end(2));
            int separator = category.indexOf(CharacterTable.VERTICAL_LINE);
            if (separator != -1) {
                category = category.substring(0, separator);
            }
            if (category.equals(parsedPageTitle.getForm()) || disambiguationPageSet.contains(parsedPageTitle.getPage())) {
                continue;
            }
            if (!parsedPageTitle.isCompliant()) {
                continue;
            }
            Example categoryExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), EMPTY_CONTEXT, category, Example.CONTENT_FROM_CATEGORY);
            exampleList.add(categoryExample);
            addNominalVariantExample(categoryExample);
            addPersonSurnameExample(categoryExample);
            addRedirectLinkExamples(categoryExample);
        } catch (Exception ex) {
            logger.error("Exception adding category examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
        }
    }
}
/**
 * Adds examples extracted from the internal links of every section of the page.
 * <p>
 * For each compliant link the target is resolved through the redirect map (at most two
 * hops; a second hop is logged as a warning). An example is added only when the resolved
 * target is a known content page and not a disambiguation page. A failure on one link is
 * logged and does not stop the remaining links.
 *
 * @param parsedPage      the parsed page
 * @param parsedPageTitle the parsed page title
 */
private void addLinkExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
    for (Section section : parsedPage.getSections()) {
        List<Link> internalLinks = section.getLinks(Link.type.INTERNAL);
        String sectionTitle = section.getTitle();
        for (Link link : internalLinks) {
            try {
                ParsedPageLink parsedPageLink = new ParsedPageLink(link);
                if (!parsedPageLink.isCompliant()) {
                    continue;
                }
                String redirectPage = redirectPageMap.get(parsedPageLink.getPage());
                //todo: check multiple redirects
                if (redirectPage != null) {
                    String secondRedirectPage = redirectPageMap.get(redirectPage);
                    if (secondRedirectPage != null) {
                        logger.warn(parsedPageLink.getPage() + " -> " + redirectPage + " -> " + secondRedirectPage);
                        parsedPageLink.setPage(secondRedirectPage);
                    }
                    else {
                        parsedPageLink.setPage(redirectPage);
                    }
                }
                //todo: comment first if used on the whole dump
                if (contentPageMap.get(parsedPageLink.getPage()) != null && !disambiguationPageSet.contains(parsedPageLink.getPage())) {
                    ParsedPageTitle parsedLinkTitle = new ParsedPageTitle(parsedPageLink.getPage());
                    Example example = new Example(parsedPageLink.getForm(), parsedPageLink.getPage(), parsedPageTitle.getPage(), buildLeftContext(parsedPageLink, parsedPageTitle, parsedLinkTitle, sectionTitle), parsedPageLink.getRightContext(), Example.CONTENT_FROM_LINK);
                    addExample(example);
                    addPersonSurnameExample(example);
                    // redirect variants are not added here: they introduce noise into page/forms mapping
                }
            } catch (Exception ex) {
                logger.error("Exception adding link examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
            }
        }
    }
}
/**
 * Creates the left context of a link example: the form of the page title, the section title
 * (unless absent or matched by the skip pattern), the form and suffix of the linked page
 * title (only when that form differs from the link form), an end-of-sentence marker and
 * finally the left context of the link itself.
 *
 * @param parsedPageLink  the parsed page link
 * @param parsedPageTitle the parsed page title
 * @param parsedLinkTitle the parsed page title of the link; rebuilt from the link page when null
 * @param sectionTitle    the section title, may be null
 * @return the left context
 */
private String buildLeftContext(ParsedPageLink parsedPageLink, ParsedPageTitle parsedPageTitle, ParsedPageTitle parsedLinkTitle, String sectionTitle) {
    StringBuilder leftContextBuilder = new StringBuilder();
    // add the page title (form)
    leftContextBuilder.append(parsedPageTitle.getForm());
    leftContextBuilder.append(CharacterTable.SPACE);
    // The skip pattern is optional (see the constructor); without it every section title is kept.
    boolean skipSectionTitle = sectionTitle == null
            || (sectionTitleSkipPattern != null && sectionTitleSkipPattern.matcher(sectionTitle).find());
    if (!skipSectionTitle) {
        // add the section title
        leftContextBuilder.append(sectionTitle);
        leftContextBuilder.append(CharacterTable.SPACE);
    }
    // Use the title the caller already parsed instead of parsing the same page again.
    ParsedPageTitle linkTitle = parsedLinkTitle != null ? parsedLinkTitle : new ParsedPageTitle(parsedPageLink.getPage());
    if (!parsedPageLink.getForm().equals(linkTitle.getForm())) {
        // add the linked page title (form)
        leftContextBuilder.append(linkTitle.getForm());
        leftContextBuilder.append(CharacterTable.SPACE);
        if (linkTitle.hasSuffix()) {
            // add the linked page title's suffix
            leftContextBuilder.append(linkTitle.getSuffix());
            leftContextBuilder.append(CharacterTable.SPACE);
        }
    }
    leftContextBuilder.append(END_OF_SENTENCE);
    leftContextBuilder.append(parsedPageLink.getLeftContext());
    return leftContextBuilder.toString();
}
/**
 * Tells whether the string starts with a lowercase character. Only the first character is
 * inspected.
 *
 * @param s the string to test, may be null or empty
 * @return true if the first character is lowercase; false for a null or empty string
 */
private boolean isLowerCase(String s) {
    // charAt(0) on an empty string would throw StringIndexOutOfBoundsException.
    return s != null && s.length() > 0 && Character.isLowerCase(s.charAt(0));
}
/**
 * Adds, for every compliant redirect title pointing to the page of the example, a copy of
 * the example that uses the redirect form, together with its nominal variant.
 * <p>
 * When the form of the original example starts with a lowercase character the redirect form
 * is lowercased too. Each distinct form is used at most once, and the form of the original
 * example is never repeated. A failure on one redirect is logged and does not stop the
 * remaining ones.
 *
 * @param example the original example
 */
private void addRedirectLinkExamples(Example example) {
    Set<String> redirectSet = reverseRedirectPageMap.get(example.getPage());
    if (redirectSet == null) {
        return;
    }
    Set<String> formSet = new HashSet<String>();
    formSet.add(example.getForm());
    for (String redirectTitle : redirectSet) {
        try {
            ParsedPageTitle redirectParsedPageTitle = new ParsedPageTitle(redirectTitle);
            if (!redirectParsedPageTitle.isCompliant()) {
                continue;
            }
            String redirectForm = redirectParsedPageTitle.getForm();
            if (isLowerCase(example.getForm())) {
                // Strings are immutable: the lowercased value must be assigned back.
                redirectForm = redirectForm.toLowerCase();
            }
            if (!formSet.contains(redirectForm)) {
                Example redirectExample = new Example(redirectForm, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), example.getType() + Example.CONTENT_FROM_REDIRECTION_PAGE);
                addExample(redirectExample);
                formSet.add(redirectForm);
                addNominalVariantExample(redirectExample);
            }
        } catch (Exception ex) {
            logger.error("Exception adding redirect link examples (" + exampleCounter.intValue() + ")\n" + ex);
        }
    }
}
/**
 * Adds the example of the page itself: the title suffix (if any) is the left context and the
 * text of the first section (if any) is the right context. Nominal, surname and
 * redirect-page variants are added as well.
 * <p>
 * Nothing is added for disambiguation pages or non-compliant titles. Any exception is logged
 * and swallowed.
 *
 * @param parsedPage      the parsed page
 * @param parsedPageTitle the parsed page title
 */
private void addPageExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
    try {
        if (disambiguationPageSet.contains(parsedPageTitle.getPage())) {
            return;
        }
        Section firstSection = parsedPage.getSection(0);
        String rightContext = firstSection != null ? firstSection.getText() : EMPTY_CONTEXT;
        String leftContext = parsedPageTitle.hasSuffix() ? parsedPageTitle.getSuffix() : EMPTY_CONTEXT;
        if (!parsedPageTitle.isCompliant()) {
            return;
        }
        Example pageExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), leftContext, rightContext, Example.CONTENT_FROM_PAGE);
        addExample(pageExample);
        addNominalVariantExample(pageExample);
        addPersonSurnameExample(pageExample);
        addRedirectPageExamples(pageExample, parsedPageTitle);
    } catch (Exception ex) {
        logger.error("Exception adding page examples for page " + parsedPageTitle.getPage() + " (" + exampleCounter.intValue() + ")\n" + ex);
    }
}
/**
 * Adds alternative examples created from the redirect titles that point to the page.
 * <p>
 * Each compliant redirect title yields an example with the redirect form, the suffix of that
 * redirect title (if any) as left context and the right context of the original example,
 * plus its nominal variant. A failure on one redirect is logged and does not stop the
 * remaining ones.
 *
 * @param example         the original example
 * @param parsedPageTitle the source parsed page title
 */
private void addRedirectPageExamples(Example example, ParsedPageTitle parsedPageTitle) {
    Set<String> redirectSet = reverseRedirectPageMap.get(parsedPageTitle.getPage());
    if (redirectSet == null) {
        return;
    }
    for (String redirectTitle : redirectSet) {
        try {
            ParsedPageTitle redirectParsedPageTitle = new ParsedPageTitle(redirectTitle);
            // Computed per redirect: a suffix must not carry over to a later redirect without one.
            String leftContext = EMPTY_CONTEXT;
            if (redirectParsedPageTitle.hasSuffix()) {
                leftContext = redirectParsedPageTitle.getSuffix();
            }
            if (redirectParsedPageTitle.isCompliant()) {
                Example redirectExample = new Example(redirectParsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), leftContext, example.getRightContext(), example.getType() + Example.CONTENT_FROM_REDIRECTION_PAGE);
                addExample(redirectExample);
                addNominalVariantExample(redirectExample);
            }
        } catch (Exception ex) {
            logger.error("Exception adding redirect page examples (" + exampleCounter.intValue() + ")\n" + ex);
        }
    }
}
/**
 * Adds a copy of the example that uses the surname of the person described by the page.
 * <p>
 * Nothing is added when the page has no person information or when the surname is missing or
 * empty. The copy is kept only while the counter of the surname form does not exceed the
 * configured maximum. Any exception is logged and swallowed.
 *
 * @param example the original example
 */
private void addPersonSurnameExample(Example example) {
    PersonInfoMap.Person person = personInformationMap.get(example.getPage());
    if (person == null) {
        return;
    }
    try {
        String surname = person.getSurname();
        // A missing surname is an ordinary "nothing to add" case, not an error to be logged.
        if (surname == null || surname.length() == 0) {
            return;
        }
        Example surnameExample = new Example(surname, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), example.getType() + Example.CONTENT_FROM_PERSON_INFORMATION);
        if (formCounter.get(surname) <= maximumNumberOfExamplesPerPage) {
            exampleList.add(surnameExample);
        }
    } catch (Exception ex) {
        logger.error("Exception adding person info examples (" + exampleCounter.intValue() + ")\n" + ex);
    }
}
/**
 * For nominal pages, adds a copy of the example whose form is lowercased. The copy is kept
 * only while the counter of the lowercase form does not exceed the configured maximum.
 *
 * @param example the original example
 */
private void addNominalVariantExample(Example example) {
    if (!nominal) {
        return;
    }
    String lowerCaseForm = example.getForm().toLowerCase();
    Example nominalExample = new Example(lowerCaseForm, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), example.getType() + Example.CONTENT_FROM_NOMINAL);
    boolean underLimit = formCounter.get(lowerCaseForm) <= maximumNumberOfExamplesPerPage;
    if (underLimit) {
        exampleList.add(nominalExample);
    }
}
}
/**
 * One training example: a form linked to a page, the page it was found on, the contexts
 * around it and a code telling where it came from.
 */
class Example {
    // One-letter origin codes; variants append their own code to the code of the original.
    public static final String CONTENT_FROM_PERSON_INFORMATION = "I";
    public static final String CONTENT_FROM_REDIRECTION_PAGE = "R";
    public static final String CONTENT_FROM_LINK = "L";
    public static final String CONTENT_FROM_PAGE = "P";
    public static final String CONTENT_FROM_CATEGORY = "C";
    public static final String CONTENT_FROM_SECTION_TITLE = "S";
    public static final String CONTENT_FROM_NOMINAL = "N";
    public static final String CONTENT_FROM_TEXT = "T";
    public static final String CONTENT_FROM_TITLE_SUFFIX = "U";

    private String type;
    private String source;
    private String page;
    private String leftContext;
    private String form;
    private String rightContext;

    /**
     * Creates the example. As a side effect the form and the page are added to the form and
     * page counters of the enclosing extractor, whether or not the example is kept later.
     */
    Example(String form, String page, String source, String leftContext, String rightContext, String type) {
        //todo:remove from here
        formCounter.add(form);
        pageCounter.add(page);
        this.type = type;
        this.rightContext = rightContext;
        this.leftContext = leftContext;
        this.page = page;
        this.source = source;
        this.form = form;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getPage() {
        return page;
    }

    public void setPage(String page) {
        this.page = page;
    }

    public String getLeftContext() {
        return leftContext;
    }

    public void setLeftContext(String leftContext) {
        this.leftContext = leftContext;
    }

    public String getForm() {
        return form;
    }

    public void setForm(String form) {
        this.form = form;
    }

    public String getRightContext() {
        return rightContext;
    }

    public void setRightContext(String rightContext) {
        this.rightContext = rightContext;
    }

    /**
     * An example is empty when form, page or source is null or zero-length, when either
     * context is null, or when both contexts are zero-length.
     */
    public boolean isEmpty() {
        if (lacksText(form) || lacksText(page) || lacksText(source)) {
            return true;
        }
        if (leftContext == null || rightContext == null) {
            return true;
        }
        return leftContext.length() == 0 && rightContext.length() == 0;
    }

    /** True for a null or zero-length string. */
    private boolean lacksText(String value) {
        return value == null || value.length() == 0;
    }

    /** Same as {@code toString(0)}. */
    public String toString() {
        return toString(0);
    }

    /**
     * Builds the tab-separated record: form index, page index, tokenized form, page, source,
     * count, type, tokenized left context, tokenized right context.
     * NOTE(review): count is written before type, while the enclosing class declares
     * TYPE_INDEX = 5 and ID_INDEX = 6 -- confirm which order the readers expect.
     *
     * @param count the running number written in the sixth column
     */
    public String toString(int count) {
        String tokenizedForm = tokenizer.tokenizedString(form);
        int formIndex = formIndexer.get(tokenizedForm);
        String pageIndex = contentPageMap.get(page);

        StringBuilder record = new StringBuilder();
        record.append(formIndex).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(pageIndex).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(tokenizedForm).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(page).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(source).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(count).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(type).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(tokenizer.tokenizedString(leftContext)).append(CharacterTable.HORIZONTAL_TABULATION);
        record.append(tokenizer.tokenizedString(rightContext));
        return record.toString();
    }
}
/** No-op: this extractor does nothing for category pages. */
@Override
public void categoryPage(String text, String title, int wikiID) {
// no examples are extracted from category pages
}
/** No-op: this extractor does nothing for template pages. */
@Override
public void templatePage(String text, String title, int wikiID) {
// no examples are extracted from template pages
}
/** No-op: this extractor does nothing for redirect pages. */
@Override
public void redirectPage(String text, String title, int wikiID) {
// no examples are extracted from redirect pages
}
/** No-op: this extractor does nothing for portal pages. */
@Override
public void portalPage(String text, String title, int wikiID) {
// no examples are extracted from portal pages
}
/** No-op: this extractor does nothing for project pages. */
@Override
public void projectPage(String text, String title, int wikiID) {
// no examples are extracted from project pages
}
/**
 * Finishes the extraction: writes the form counter, the page counter and the form index to
 * their files and closes all four output writers.
 * <p>
 * Every writer is closed even when one of the write steps fails. Because PrintWriter never
 * throws on write errors, its error flag is checked and reported before closing.
 */
@Override
public void endProcess() {
    super.endProcess();
    try {
        try {
            logger.info("writing " + decimalFormat.format(formCounter.size()) + " forms (counter)...");
            writeFormCounter();
        } catch (IOException e) {
            logger.error("Unable to write the form counter", e);
        } finally {
            closeOutput(formCounterWriter, "form counter");
        }
        try {
            logger.info("writing " + decimalFormat.format(pageCounter.size()) + " pages (counter)...");
            pageCounter.write(pageCounterWriter);
        } catch (IOException e) {
            logger.error("Unable to write the page counter", e);
        } finally {
            closeOutput(pageCounterWriter, "page counter");
        }
        try {
            logger.info("writing " + decimalFormat.format(formIndexer.size()) + " forms (indexer)...");
            formIndexer.write(formIdWriter);
        } catch (IOException e) {
            logger.error("Unable to write the form index", e);
        } finally {
            closeOutput(formIdWriter, "form index");
        }
    } finally {
        // Runs even if a step above fails with an unchecked exception.
        logger.debug("closing the output stream...");
        closeOutput(exampleWriter, "example file");
    }
}

/** Reports a pending write error of the writer, then closes it; a null writer is ignored. */
private static void closeOutput(PrintWriter writer, String description) {
    if (writer == null) {
        return;
    }
    // checkError() flushes the writer and returns true if any earlier write failed.
    if (writer.checkError()) {
        logger.error("I/O error while writing the " + description);
    }
    writer.close();
}
/**
 * Writes the form counter to the form/freq file, one line per form: the frequency, a tab and
 * the tokenized form, in the order given by the sorted map of the counter.
 * NOTE(review): the generic types were lost in this copy of the file; the map is assumed to
 * be keyed by frequency with the list of forms as value -- confirm against SynchronizedCounter.
 *
 * @throws IOException declared for callers; the writer itself does not throw
 */
public void writeFormCounter() throws IOException {
    SortedMap<AtomicInteger, List<String>> sortedMap = formCounter.getSortedMap();
    for (Map.Entry<AtomicInteger, List<String>> entry : sortedMap.entrySet()) {
        String frequency = entry.getKey().toString();
        for (String form : entry.getValue()) {
            formCounterWriter.print(frequency);
            formCounterWriter.print(CharacterTable.HORIZONTAL_TABULATION);
            formCounterWriter.println(tokenizer.tokenizedString(form));
        }
    }
}
/**
 * Command-line entry point: parses the options, prepares the output directory and runs the
 * extraction. On any option error, including a non-numeric value for a numeric option, the
 * error and the usage text are printed.
 *
 * @param args the command-line arguments
 * @throws IOException if the extractor cannot be created
 */
public static void main(String args[]) throws IOException {
    String logConfig = System.getProperty("log-config");
    if (logConfig == null) {
        logConfig = "configuration/log-config.txt";
    }
    PropertyConfigurator.configure(logConfig);
    Options options = new Options();
    try {
        Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg().withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump").create("d");
        Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("output directory in which to store output files").isRequired().withLongOpt("output-dir").create("o");
        Option numThreadOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of threads (default " + Defaults.DEFAULT_THREADS_NUMBER + ")").withLongOpt("num-threads").create("t");
        Option numPageOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of pages to process (default all)").withLongOpt("num-pages").create("p");
        Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default " + Defaults.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("n");
        Option maximumFormFreqOpt = OptionBuilder.withArgName("max-freq").hasArg().withDescription("maximum frequency of wanted forms (default is " + WikipediaExtractor.DEFAULT_MAXIMUM_FORM_FREQ + ")").withLongOpt("max-freq").create("m");
        options.addOption("h", "help", false, "print this message");
        options.addOption("v", "version", false, "output version information and exit");
        Option baseDirOpt = OptionBuilder.withDescription("if set, use the output folder as base dir").withLongOpt("base-dir").create();
        options.addOption(wikipediaDumpOpt);
        options.addOption(outputDirOpt);
        options.addOption(numThreadOpt);
        options.addOption(numPageOpt);
        options.addOption(notificationPointOpt);
        options.addOption(maximumFormFreqOpt);
        options.addOption(baseDirOpt);

        CommandLineParser parser = new PosixParser();
        CommandLine line = parser.parse(options, args);

        int numThreads = parseIntOption(line, "num-threads", Defaults.DEFAULT_THREADS_NUMBER);
        int numPages = parseIntOption(line, "num-pages", Defaults.DEFAULT_NUM_PAGES);
        int notificationPoint = parseIntOption(line, "notification-point", Defaults.DEFAULT_NOTIFICATION_POINT);

        ExtractorParameters extractorParameters;
        if (line.hasOption("base-dir")) {
            extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"), true);
        }
        else {
            extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"));
        }
        File dest = new File(extractorParameters.getExtractionOutputDirName());
        if (dest.mkdirs()) {
            logger.info(dest + " created");
        }
        logger.debug(extractorParameters);

        int maximumFormFreq = parseIntOption(line, "max-freq", WikipediaExampleExtractor.DEFAULT_MAXIMUM_FORM_FREQ);
        logger.debug("filtering examples with frequency higher than " + maximumFormFreq + "...");
        logger.debug("extracting examples (" + extractorParameters.getWikipediaExampleFileName() + ")...");

        WikipediaExampleExtractor wikipediaExtractor = new WikipediaExampleExtractor(numThreads, numPages, extractorParameters.getLocale());
        wikipediaExtractor.setNotificationPoint(notificationPoint);
        wikipediaExtractor.setMaximumNumberOfExamplesPerPage(maximumFormFreq);
        wikipediaExtractor.start(extractorParameters);
        logger.info("extraction ended " + new Date());
    } catch (ParseException e) {
        // oops, something went wrong
        System.out.println("Parsing failed: " + e.getMessage() + "\n");
        HelpFormatter formatter = new HelpFormatter();
        // The usage line names the class under its actual package.
        formatter.printHelp(400, "java -cp dist/thewikimachine.jar eu.fbk.twm.wiki.xmldump.WikipediaExampleExtractor", "\n", options, "\n", true);
    }
}

/**
 * Reads an integer option, falling back to the default when the option is absent. A value
 * that is not an integer is reported as a ParseException, so it is handled like any other
 * command-line error instead of terminating the program with a stack trace.
 */
private static int parseIntOption(CommandLine line, String longOpt, int defaultValue) throws ParseException {
    if (!line.hasOption(longOpt)) {
        return defaultValue;
    }
    String rawValue = line.getOptionValue(longOpt);
    try {
        return Integer.parseInt(rawValue);
    } catch (NumberFormatException e) {
        throw new ParseException("option --" + longOpt + " expects an integer but got '" + rawValue + "'");
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (listing footer, not part of the source)