All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.mahout.text.SequenceFilesFromMailArchivesMapper Maven / Gradle / Ivy

Go to download

Optional components of Mahout which generally support interaction with third party systems, formats, APIs, etc.

There is a newer version: 0.13.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.text;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.utils.email.MailOptions;
import org.apache.mahout.utils.email.MailProcessor;

import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.BODY_SEPARATOR_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.CHARSET_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.CHUNK_SIZE_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.FROM_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.KEY_PREFIX_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.QUOTED_REGEX_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.REFERENCES_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.SEPARATOR_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.STRIP_QUOTED_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.SUBJECT_OPTION;
import static org.apache.mahout.text.SequenceFilesFromMailArchives.TO_OPTION;

/**
 * Map class for the SequenceFilesFromMailArchives job.
 *
 * <p>Each input record is treated as one mail archive: the key is the index of the file inside the
 * current {@link CombineFileSplit} and the value holds the bytes that are parsed as a mailbox. For
 * every message found, one {@code (Text, Text)} pair is emitted whose key is
 * {@code prefix/relativeFile/messageId} and whose value is the selected header values followed by
 * the (optional) body, joined with the configured separator.
 */
public class SequenceFilesFromMailArchivesMapper extends Mapper<IntWritable, BytesWritable, Text, Text> {

  /** Matches the "From sender@host ... year" line that starts a new message in the archive. */
  private static final Pattern MESSAGE_START = Pattern.compile(
    "^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
  /** Matches a Message-ID header line and captures the id between the angle brackets. */
  private static final Pattern MESSAGE_ID_PREFIX = Pattern.compile(
    "^message-id: <(.*)>$", Pattern.CASE_INSENSITIVE);

  // Writable instances reused for every emitted record.
  private final Text outKey = new Text();
  private final Text outValue = new Text();

  private MailOptions options;

  /**
   * Reads the job configuration and absorbs all of the options into a {@link MailOptions} object.
   *
   * @param context the task context providing the job {@link Configuration}
   */
  @Override
  public void setup(Context context) throws IOException, InterruptedException {

    Configuration configuration = context.getConfiguration();

    this.options = new MailOptions();

    options.setPrefix(configuration.get(KEY_PREFIX_OPTION[1], ""));

    if (isSet(configuration, CHUNK_SIZE_OPTION[0])) {
      options.setChunkSize(configuration.getInt(CHUNK_SIZE_OPTION[0], 64));
    }

    // Fall back to UTF-8 when no charset has been configured.
    String charsetName = configuration.get(CHARSET_OPTION[0], "");
    options.setCharset(Charset.forName(charsetName.isEmpty() ? "UTF-8" : charsetName));

    List<Pattern> patterns = Lists.newArrayListWithCapacity(5);
    // patternOrder is used downstream so that we can know what order the
    // text is in instead of encoding it in the string, which would require
    // more processing later to remove it pre feature selection.
    Map<String, Integer> patternOrder = Maps.newHashMap();
    if (isSet(configuration, FROM_OPTION[1])) {
      registerPattern(patterns, patternOrder, MailOptions.FROM, MailProcessor.FROM_PREFIX);
    }
    if (isSet(configuration, TO_OPTION[1])) {
      registerPattern(patterns, patternOrder, MailOptions.TO, MailProcessor.TO_PREFIX);
    }
    if (isSet(configuration, REFERENCES_OPTION[1])) {
      registerPattern(patterns, patternOrder, MailOptions.REFS, MailProcessor.REFS_PREFIX);
    }
    if (isSet(configuration, SUBJECT_OPTION[1])) {
      registerPattern(patterns, patternOrder, MailOptions.SUBJECT, MailProcessor.SUBJECT_PREFIX);
    }

    options.setStripQuotedText(configuration.getBoolean(STRIP_QUOTED_OPTION[1], false));

    options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
    options.setPatternOrder(patternOrder);

    options.setIncludeBody(configuration.getBoolean(BODY_OPTION[1], false));

    // "\n" is the default separator unless one is configured explicitly.
    options.setSeparator("\n");
    if (isSet(configuration, SEPARATOR_OPTION[1])) {
      options.setSeparator(configuration.get(SEPARATOR_OPTION[1], ""));
    }
    if (isSet(configuration, BODY_SEPARATOR_OPTION[1])) {
      options.setBodySeparator(configuration.get(BODY_SEPARATOR_OPTION[1], ""));
    }
    if (isSet(configuration, QUOTED_REGEX_OPTION[1])) {
      options.setQuotedTextPattern(Pattern.compile(configuration.get(QUOTED_REGEX_OPTION[1], "")));
    }

  }

  /** Returns true when the configuration holds a non-empty value for {@code name}. */
  private static boolean isSet(Configuration configuration, String name) {
    return !configuration.get(name, "").isEmpty();
  }

  /**
   * Appends {@code pattern} to {@code patterns} and records, under {@code name}, the index it
   * occupies there, so the recorded order always equals the position in the patterns array.
   */
  private static void registerPattern(List<Pattern> patterns, Map<String, Integer> patternOrder,
                                      String name, Pattern pattern) {
    patternOrder.put(name, patterns.size());
    patterns.add(pattern);
  }

  /**
   * Parses a mailbox line by line and emits one record per message.
   *
   * <p>A message starts being collected once a Message-ID header has been seen and is written out
   * when the next message-start line is reached, or at the end of the input.
   *
   * @param filename           name used to build the output key
   * @param mailBoxInputStream stream providing the mailbox content
   * @param context            task context the records are written to
   * @return the number of Message-ID headers that started a message
   * @throws IOException          if reading the input or writing a record fails
   * @throws InterruptedException if writing a record is interrupted
   */
  public long parseMailboxLineByLine(String filename, InputStream mailBoxInputStream, Context context)
    throws IOException, InterruptedException {
    long messageCount = 0;
    try {
      StringBuilder contents = new StringBuilder();
      StringBuilder body = new StringBuilder();
      Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
      Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
      Pattern[] headerPatterns = options.getPatternsToMatch();
      // NOTE(review): patternResults is never cleared between messages, so a header that is
      // missing from one message keeps the value captured for an earlier one - confirm intended.
      String[] patternResults = new String[headerPatterns.length];
      Matcher[] matches = new Matcher[headerPatterns.length];
      for (int i = 0; i < matches.length; i++) {
        matches[i] = headerPatterns[i].matcher("");
      }

      String messageId = null;
      boolean inBody = false;
      Pattern quotedTextPattern = options.getQuotedTextPattern();

      for (String nextLine : new FileLineIterable(mailBoxInputStream, options.getCharset(), false, filename)) {
        // when quoted text is stripped, lines matching the quoted-text pattern are skipped entirely
        if (options.isStripQuotedText() && quotedTextPattern.matcher(nextLine).find()) {
          continue;
        }

        for (int i = 0; i < matches.length; i++) {
          Matcher matcher = matches[i];
          matcher.reset(nextLine);
          if (matcher.matches()) {
            patternResults[i] = matcher.group(1);
          }
        }

        if (messageId == null) {
          // no message is being collected yet: look for the Message-ID header that starts one
          if (nextLine.length() > 14) {
            messageIdMatcher.reset(nextLine);
            if (messageIdMatcher.matches()) {
              messageId = messageIdMatcher.group(1);
              ++messageCount;
            }
          }
          continue;
        }

        // only start appending body content after we've seen a message ID;
        // first, see if we hit the end of the message
        messageBoundaryMatcher.reset(nextLine);
        if (messageBoundaryMatcher.matches()) {
          // done parsing this message ... write it out
          writeMessage(filename, messageId, contents, body, patternResults, context);
          body.setLength(0);
          messageId = null;
          inBody = false;
        } else if (inBody && options.isIncludeBody()) {
          if (!nextLine.isEmpty()) {
            body.append(nextLine).append(options.getBodySeparator());
          }
        } else {
          // first empty line we see after reading the message Id
          // indicates that we are in the body ...
          inBody = nextLine.isEmpty();
        }
      }
      // write the last message in the file if available
      if (messageId != null) {
        writeMessage(filename, messageId, contents, body, patternResults, context);
      }
    } catch (FileNotFoundException ignored) {
      // Deliberately best-effort: a missing input yields the messages counted so far.
      // NOTE(review): the failure is not reported anywhere - confirm this is intended.
    }
    return messageCount;
  }

  /**
   * Emits one message: builds its key and content, writes the record and resets the content buffer.
   */
  private void writeMessage(String filename, String messageId, StringBuilder contents, CharSequence body,
                            String[] patternResults, Context context)
    throws IOException, InterruptedException {
    String key = generateKey(filename, options.getPrefix(), messageId);
    // if this ordering changes, then also change
    // FromEmailToDictionaryMapper
    writeContent(options.getSeparator(), contents, body, patternResults);

    this.outKey.set(key);
    this.outValue.set(contents.toString());
    context.write(this.outKey, this.outValue);
    contents.setLength(0); // reset the buffer
  }

  /**
   * Builds the output key by joining prefix, mailbox file name and message id with
   * {@link Path#SEPARATOR}.
   */
  protected static String generateKey(String mboxFilename, String prefix, String messageId) {
    return Joiner.on(Path.SEPARATOR).join(Lists.newArrayList(prefix, mboxFilename, messageId).iterator());
  }

  /**
   * Appends the header values (nulls rendered as empty strings), a separator and the body to
   * {@code contents}.
   */
  private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) {
    String matchesString = Joiner.on(separator).useForNull("").join(Arrays.asList(matches).iterator());
    contents.append(matchesString).append(separator).append(body);
  }

  /**
   * Parses one mail archive and emits its messages.
   *
   * @param key     index of the file within the current {@link CombineFileSplit}
   * @param value   bytes parsed as the mailbox content
   * @param context task context the records are written to
   */
  @Override
  public void map(IntWritable key, BytesWritable value, Context context)
    throws IOException, InterruptedException {
    Configuration configuration = context.getConfiguration();
    Path filePath = ((CombineFileSplit) context.getInputSplit()).getPath(key.get());
    String relativeFilePath = HadoopUtil.calcRelativeFilePath(configuration, filePath);
    // getBytes() exposes the whole backing array; only the first getLength() bytes are valid data,
    // so bound the stream to avoid parsing padding or stale bytes as mailbox text.
    InputStream is = new ByteArrayInputStream(value.getBytes(), 0, value.getLength());
    parseMailboxLineByLine(relativeFilePath, is, context);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy