org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mahout-integration Show documentation
Optional components of Mahout which generally support interaction with third party systems, formats, APIs, etc.
There is a newer version: 0.13.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.text.wikipedia;

import com.google.common.io.Closeables;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.common.ClassUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Maps over Wikipedia XML pages and outputs every document that carries one of the categories listed in
 * the job configuration.
 *
 * <p>For each matching page the mapper emits the matched category (with whitespace and non-word characters
 * replaced by {@code _}) as the key and the analyzed, space-separated tokens of the page as the value.
 * Pages without a matching category are dropped.
 *
 * <p>Configuration keys read in {@link #setup(Context)}:
 * <ul>
 *   <li>{@code wikipedia.categories} - the stringified set of categories to look for (default: empty set)</li>
 *   <li>{@code exact.match.only} - whether a page category must equal an input category (default: false)</li>
 *   <li>{@code analyzer.class} - the Lucene {@link Analyzer} used to tokenize the page
 *       (default: {@link WikipediaAnalyzer})</li>
 * </ul>
 */
public class WikipediaDatasetCreatorMapper extends Mapper<LongWritable, Text, Text, Text> {

  private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);

  /** Sentinel returned by {@link #findMatchingCategory(String)} when no input category matches. */
  private static final String UNKNOWN_CATEGORY = "Unknown";
  /** Wiki markup that opens a category link. */
  private static final String CATEGORY_LINK_PREFIX = "[[Category:";
  /** Wiki markup that closes a category link. */
  private static final String CATEGORY_LINK_SUFFIX = "]]";

  private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
  // Opening and closing tags of the <text> element that wraps the page body in the Wikipedia dump.
  private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
  private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");

  /** Categories to look for; {@code inputCategoryPatterns.get(i)} is the pattern for element {@code i}. */
  private List<String> inputCategories;
  private List<Pattern> inputCategoryPatterns;
  private boolean exactMatchOnly;
  private Analyzer analyzer;

  /**
   * Strips the {@code <text>} wrapper, unescapes HTML entities, and, if the page belongs to one of the
   * input categories, writes the category together with the analyzed page text.
   *
   * @param key     input key; not used
   * @param value   the raw page text
   * @param context the context the (category, tokens) pair is written to
   * @throws IOException          if tokenizing the page or writing the output fails
   * @throws InterruptedException if writing the output is interrupted
   */
  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String withoutOpenTag = OPEN_TEXT_TAG_PATTERN.matcher(value.toString()).replaceFirst("");
    String document = StringEscapeUtils.unescapeHtml4(
        CLOSE_TEXT_TAG_PATTERN.matcher(withoutOpenTag).replaceAll(""));

    String catMatch = findMatchingCategory(document);
    if (UNKNOWN_CATEGORY.equals(catMatch)) {
      return;
    }

    StringBuilder contents = new StringBuilder(1000);
    TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
    try {
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
      }
      stream.end();
    } finally {
      // Always release the stream, even when tokenizing fails; an exception thrown by close() itself is
      // swallowed so that it cannot mask the original failure.
      Closeables.close(stream, true);
    }

    context.write(
        new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
        new Text(contents.toString()));
  }

  /**
   * Reads the category list, the match mode and the analyzer class from the job configuration.
   *
   * <p>The category list and the analyzer are only initialized when the corresponding field is still
   * {@code null}.
   *
   * @param context the task context providing the job configuration
   * @throws IOException          if the category set cannot be deserialized
   * @throws InterruptedException if the parent setup is interrupted
   */
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);

    Configuration conf = context.getConfiguration();

    if (inputCategories == null) {
      Set<String> newCategories = new HashSet<>();
      DefaultStringifier<Set<String>> setStringifier =
          new DefaultStringifier<>(conf, GenericsUtil.getClass(newCategories));
      // Falls back to the stringified empty set when no categories were configured.
      String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
      Set<String> inputCategoriesSet = setStringifier.fromString(categoriesStr);
      inputCategories = new ArrayList<>(inputCategoriesSet);
      inputCategoryPatterns = new ArrayList<>(inputCategories.size());
      for (String inputCategory : inputCategories) {
        // NOTE(review): the category is inserted into the regex unquoted, so regex metacharacters in a
        // category name are interpreted as regex syntax - confirm whether that is intended.
        inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + "\\b.*"));
      }
    }

    exactMatchOnly = conf.getBoolean("exact.match.only", false);

    if (analyzer == null) {
      String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName());
      analyzer = ClassUtils.instantiateAs(analyzerStr, Analyzer.class);
    }

    log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}",
             inputCategories.size(), exactMatchOnly, analyzer.getClass().getName());
  }

  /**
   * Scans the document for {@code [[Category:...]]} links and returns the first input category that
   * matches one of them.
   *
   * <p>Each page category is lower-cased and trimmed before comparison. In exact-match mode it must be
   * contained in the input category list and the page category itself is returned; otherwise the first
   * input category whose word-boundary pattern matches the page category is returned.
   *
   * @param document the unescaped page text
   * @return the matching category, or {@code "Unknown"} if no category link matches
   */
  private String findMatchingCategory(String document) {
    int searchFrom = 0;
    int linkStart;
    while ((linkStart = document.indexOf(CATEGORY_LINK_PREFIX, searchFrom)) != -1) {
      int nameStart = linkStart + CATEGORY_LINK_PREFIX.length();
      int nameEnd = document.indexOf(CATEGORY_LINK_SUFFIX, nameStart);
      if (nameEnd < 0) {
        // Unterminated category link: nothing further can be parsed.
        break;
      }
      String category = document.substring(nameStart, nameEnd).toLowerCase(Locale.ENGLISH).trim();
      if (exactMatchOnly) {
        if (inputCategories.contains(category)) {
          return category;
        }
      } else {
        for (int i = 0; i < inputCategories.size(); i++) {
          // Inexact match with word boundaries.
          if (inputCategoryPatterns.get(i).matcher(category).matches()) {
            return inputCategories.get(i);
          }
        }
      }
      searchFrom = nameEnd;
    }
    return UNKNOWN_CATEGORY;
  }
}