org.apache.mahout.text.wikipedia.WikipediaMapper
Optional components of Mahout which generally support interaction with third party systems,
formats, APIs, etc.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.text.wikipedia;

import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.GenericsUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Maps over the Wikipedia XML format and outputs every document whose category list contains one of the
 * categories given in the input category file.
 */
public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {

  private static final Logger log = LoggerFactory.getLogger(WikipediaMapper.class);

  private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s]");

  // XML markers of a page in the Wikipedia dump: the text body, the title element and the redirect flag.
  private static final String START_DOC = "<text xml:space=\"preserve\">";
  private static final String END_DOC = "</text>";
  private static final Pattern TITLE = Pattern.compile("<title>(.*)<\\/title>");
  private static final String REDIRECT = "<redirect />";

  private Set<String> inputCategories;
  private boolean exactMatchOnly;
  private boolean all;
  private boolean removeLabels;
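
  /**
   * Processes one page of the Wikipedia XML dump held in {@code value}. Redirect pages are skipped;
   * otherwise the page text and title are extracted, the matching category is looked up and, unless
   * {@code all.files} is set, pages without a matching category are dropped. The surviving document is
   * HTML-unescaped, optionally stripped of its {@code [[Category:...]]} labels, and written out under
   * the key {@code /category/document_title} (whitespace in the title replaced by underscores).
   */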
  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String content = value.toString();
    if (content.contains(REDIRECT)) {
      return;
    }
    String document;
    String title;
    try {
      document = getDocument(content);
      title = getTitle(content);
    } catch (RuntimeException e) {
      // TODO: reporter.getCounter("Wikipedia", "Parse errors").increment(1);
      return;
    }

    String catMatch = findMatchingCategory(document);
    if (!all) {
      if ("Unknown".equals(catMatch)) {
        return;
      }
    }

    document = StringEscapeUtils.unescapeHtml4(document);
    if (removeLabels) {
      document = removeCategoriesFromText(document);
      // Reject documents with malformed tags
      if (document == null) {
        return;
      }
    }

    // write out in Bayes input style: key: /Category/document_name
    String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" +
        SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
    context.write(new Text(category), new Text(document));
  }
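
  /**
   * Reads the job configuration: the category set serialized under {@code wikipedia.categories}
   * (via {@code DefaultStringifier}) plus the {@code exact.match.only}, {@code all.files} and
   * {@code remove.labels} flags.
   */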
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();

    Set<String> newCategories = new HashSet<>();
    DefaultStringifier<Set<String>> setStringifier =
        new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories));

    String categoriesStr = conf.get("wikipedia.categories");
    inputCategories = setStringifier.fromString(categoriesStr);
    exactMatchOnly = conf.getBoolean("exact.match.only", false);
    all = conf.getBoolean("all.files", false);
    removeLabels = conf.getBoolean("remove.labels", false);

    log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}",
        inputCategories.size(), all, exactMatchOnly, removeLabels);
  }
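
  /**
   * Returns the article body found between the opening and closing {@code <text>} markers.
   */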
  private static String getDocument(String xml) {
    int start = xml.indexOf(START_DOC) + START_DOC.length();
    int end = xml.indexOf(END_DOC, start);
    return xml.substring(start, end);
  }
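
  /**
   * Returns the contents of the {@code <title>} element, or an empty string if no title is found.
   */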
  private static String getTitle(CharSequence xml) {
    Matcher m = TITLE.matcher(xml);
    return m.find() ? m.group(1) : "";
  }
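
  /**
   * Scans the document for {@code [[Category:...]]} links and returns the first category that matches
   * one of the input categories, either exactly or as a substring depending on {@code exact.match.only};
   * returns "Unknown" when nothing matches.
   */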
  private String findMatchingCategory(String document) {
    int startIndex = 0;
    int categoryIndex;
    while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
      categoryIndex += 11;
      int endIndex = document.indexOf("]]", categoryIndex);
      if (endIndex >= document.length() || endIndex < 0) {
        break;
      }
      String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
      if (exactMatchOnly && inputCategories.contains(category)) {
        return category.toLowerCase(Locale.ENGLISH);
      }
      if (!exactMatchOnly) {
        for (String inputCategory : inputCategories) {
          if (category.contains(inputCategory)) { // we have an inexact match
            return inputCategory.toLowerCase(Locale.ENGLISH);
          }
        }
      }
      startIndex = endIndex;
    }
    return "Unknown";
  }
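
  /**
   * Removes every {@code [[Category:...]]} link from the document, or returns {@code null} when the
   * category markup is malformed.
   */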
  private String removeCategoriesFromText(String document) {
    int startIndex = 0;
    int categoryIndex;
    try {
      while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
        int endIndex = document.indexOf("]]", categoryIndex);
        if (endIndex >= document.length() || endIndex < 0) {
          break;
        }
        document = document.replace(document.substring(categoryIndex, endIndex + 2), "");
        if (categoryIndex < document.length()) {
          startIndex = categoryIndex;
        } else {
          break;
        }
      }
    } catch (StringIndexOutOfBoundsException e) {
      return null;
    }
    return document;
  }
}
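
For context, below is a minimal, hypothetical driver sketch showing how a job could be wired to feed this mapper. It assumes the XmlInputFormat in the same package (with its START_TAG_KEY/END_TAG_KEY configuration keys) splits the dump into <page> elements, and it serializes the category set with the same DefaultStringifier/GenericsUtil pairing that setup() reads back. The class name, paths and category list are placeholders; the wiring that actually ships with mahout-integration is the WikipediaToSequenceFile driver.

import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.mahout.text.wikipedia.WikipediaMapper;
import org.apache.mahout.text.wikipedia.XmlInputFormat;

public class WikipediaFilterDriver {      // hypothetical example class, not part of Mahout

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Hand each <page>...</page> element of the dump to one map() call.
    conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
    conf.set(XmlInputFormat.END_TAG_KEY, "</page>");

    // DefaultStringifier needs JavaSerialization to round-trip the HashSet of categories.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    // The category filter that WikipediaMapper.setup() reads back from "wikipedia.categories".
    Set<String> categories = new HashSet<>();
    categories.add("science");                                        // placeholder category
    DefaultStringifier<Set<String>> stringifier =
        new DefaultStringifier<>(conf, GenericsUtil.getClass(categories));
    conf.set("wikipedia.categories", stringifier.toString(categories));
    conf.setBoolean("exact.match.only", false);
    conf.setBoolean("all.files", false);
    conf.setBoolean("remove.labels", true);

    Job job = Job.getInstance(conf, "wikipedia category filter");
    job.setJarByClass(WikipediaFilterDriver.class);
    job.setMapperClass(WikipediaMapper.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);                                         // map-only job

    FileInputFormat.addInputPath(job, new Path("/wikipedia/enwiki-pages-articles.xml"));  // placeholder
    FileOutputFormat.setOutputPath(job, new Path("/wikipedia/filtered"));                 // placeholder
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Running map-only into a SequenceFile keeps the /category/title keys and article text directly consumable by downstream Mahout jobs.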