All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.mahout.text.SequenceFilesFromMailArchives Maven / Gradle / Ivy

Go to download

Optional components of Mahout which generally support interaction with third party systems, formats, APIs, etc.

There is a newer version: 0.13.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.text;

import org.apache.commons.io.DirectoryWalker;
import org.apache.commons.io.comparator.CompositeFileComparator;
import org.apache.commons.io.comparator.DirectoryFileComparator;
import org.apache.commons.io.comparator.PathFileComparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.utils.email.MailOptions;
import org.apache.mahout.utils.email.MailProcessor;
import org.apache.mahout.utils.io.ChunkedWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Converts a directory of gzipped mail archives into SequenceFiles of specified
 * chunkSize. This class is similar to {@link SequenceFilesFromDirectory} except
 * it uses block-compressed {@link org.apache.hadoop.io.SequenceFile}s and parses out the subject and
 * body text of each mail message into a separate key/value pair.
 */
public final class SequenceFilesFromMailArchives extends AbstractJob {

  // Logger shared by this class and its nested directory walker.
  private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromMailArchives.class);

  // Command-line option descriptors. For each pair, element [0] is the first name passed to
  // addOption/addFlag in run() and the one used for hasOption/getOption lookups; element [1]
  // is the second name passed at registration. runMapReduce() also uses element [1] as the
  // job Configuration key for most options (chunkSize and charset use element [0] there).

  // Chunk size in megabytes; registered with default "64".
  public static final String[] CHUNK_SIZE_OPTION     = {"chunkSize", "chunk"};
  // Prefix prepended to each output key; registered with default "".
  public static final String[] KEY_PREFIX_OPTION     = {"keyPrefix", "prefix"};
  // Character encoding name of the input files; registered with default "UTF-8".
  public static final String[] CHARSET_OPTION        = {"charset", "c"};
  // Flag: include the mail subject in the text.
  public static final String[] SUBJECT_OPTION        = {"subject", "s"};
  // Flag: include the "to" field in the text.
  public static final String[] TO_OPTION             = {"to", "to"};
  // Flag: include the "from" field in the text.
  public static final String[] FROM_OPTION           = {"from", "from"};
  // Flag: include the references field in the text.
  public static final String[] REFERENCES_OPTION     = {"references", "refs"};
  // Flag: include the message body in the output.
  public static final String[] BODY_OPTION           = {"body", "b"};
  // Flag: strip quoted email text from the body (only read on the sequential path in this file).
  public static final String[] STRIP_QUOTED_OPTION   = {"stripQuoted", "q"};
  // Regex identifying quoted text; registered without a default value.
  public static final String[] QUOTED_REGEX_OPTION   = {"quotedRegex", "regex"};
  // Separator placed between metadata items; registered with default "\n".
  public static final String[] SEPARATOR_OPTION      = {"separator", "sep"};
  // Separator placed between body lines; registered with default "\n".
  public static final String[] BODY_SEPARATOR_OPTION = {"bodySeparator", "bodySep"};
  // Job Configuration key under which runMapReduce() stores the input path string.
  public static final String BASE_INPUT_PATH         = "baseinputpath";

  // Value written to "mapreduce.job.max.split.locations" by runMapReduce().
  private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;

  /**
   * Parses the mail input described by {@code options} and writes the result through a
   * {@link ChunkedWriter} rooted at {@code options.getOutputDir()}.
   *
   * <p>A directory input is traversed with a {@code PrefixAdditionDirectoryWalker}; any other
   * input is handed directly to {@link MailProcessor#parseMboxLineByLine}. The number of parsed
   * messages is logged in both cases. The writer is closed when this method returns or throws.
   *
   * @param options input location, output directory, chunk size, key prefix and parsing settings
   * @throws IOException if opening the writer, walking the directory, or parsing fails
   */
  public void createSequenceFiles(MailOptions options) throws IOException {
    try (ChunkedWriter chunkWriter =
             new ChunkedWriter(getConf(), options.getChunkSize(), new Path(options.getOutputDir()))) {
      MailProcessor rootProcessor = new MailProcessor(options, options.getPrefix(), chunkWriter);
      File source = options.getInput();
      if (!source.isDirectory()) {
        // Single archive: time the parse so the elapsed milliseconds can be reported.
        long begin = System.currentTimeMillis();
        long parsed = rootProcessor.parseMboxLineByLine(source);
        long elapsed = System.currentTimeMillis() - begin;
        log.info("Parsed {} messages from {} in time: {}", parsed, source.getAbsolutePath(), elapsed);
      } else {
        PrefixAdditionDirectoryWalker treeWalker = new PrefixAdditionDirectoryWalker(rootProcessor, chunkWriter);
        treeWalker.walk(source);
        log.info("Parsed {} messages from {}", treeWalker.getMessageCount(), source.getAbsolutePath());
      }
    }
  }

  /**
   * Directory walker that parses every file it visits with a {@link MailProcessor} and keeps a
   * running message count per directory level.
   *
   * <p>Two stacks are maintained in lock-step: {@code processors} holds the processor for the
   * directory currently being visited (its key prefix is the parent's prefix plus
   * {@link File#separator} plus the directory name), and {@code messageCounts} holds the number
   * of messages parsed so far at that level. When a sub-directory ends, its count is folded into
   * its parent's count, so the bottom entry ends up holding the total for the whole walk.
   */
  private static class PrefixAdditionDirectoryWalker extends DirectoryWalker<Object> {

    // Orders directory listings deterministically. Per commons-io, DIRECTORY_REVERSE places
    // files ahead of sub-directories and PATH_COMPARATOR breaks ties by path.
    // The unchecked warning comes from the generic varargs array of the constructor; both
    // delegates are file comparators, so the call is type-safe.
    @SuppressWarnings("unchecked")
    private static final Comparator<File> FILE_COMPARATOR = new CompositeFileComparator(
        DirectoryFileComparator.DIRECTORY_REVERSE, PathFileComparator.PATH_COMPARATOR);

    /** Processor stack; the head belongs to the directory currently being visited. */
    private final Deque<MailProcessor> processors = new ArrayDeque<>();
    /** Writer shared by every processor created during the walk. */
    private final ChunkedWriter writer;
    /** Message-count stack, parallel to {@link #processors}. */
    private final Deque<Long> messageCounts = new ArrayDeque<>();

    /**
     * @param processor processor used for files located directly in the start directory
     * @param writer    writer handed to the processors created for sub-directories
     */
    public PrefixAdditionDirectoryWalker(MailProcessor processor, ChunkedWriter writer) {
      processors.addFirst(processor);
      this.writer = writer;
      messageCounts.addFirst(0L);
    }

    /**
     * Walks {@code startDirectory}; no result collection is used.
     *
     * @param startDirectory directory at which the traversal starts
     * @throws IOException if the traversal fails
     */
    public void walk(File startDirectory) throws IOException {
      super.walk(startDirectory, null);
    }

    /**
     * @return the count at the head of the stack; after a completed walk this is the total
     *         number of messages parsed
     */
    public long getMessageCount() {
      return messageCounts.getFirst();
    }

    /** Pushes a processor with an extended key prefix and a fresh counter for each sub-directory. */
    @Override
    protected void handleDirectoryStart(File current, int depth, Collection<Object> results) throws IOException {
      // depth == 0 is the start directory, which already owns the entries pushed by the constructor.
      if (depth > 0) {
        log.info("At {}", current.getAbsolutePath());
        MailProcessor processor = processors.getFirst();
        MailProcessor subDirProcessor = new MailProcessor(processor.getOptions(), processor.getPrefix()
            + File.separator + current.getName(), writer);
        processors.push(subDirProcessor);
        messageCounts.push(0L);
      }
    }

    /** Sorts each directory listing in place with {@link #FILE_COMPARATOR}. */
    @Override
    protected File[] filterDirectoryContents(File directory, int depth, File[] files) throws IOException {
      Arrays.sort(files, FILE_COMPARATOR);
      return files;
    }

    /**
     * Parses one file with the current directory's processor and adds the number of parsed
     * messages to the current directory's counter.
     *
     * @throws IllegalStateException wrapping any {@link IOException} raised while parsing
     */
    @Override
    protected void handleFile(File current, int depth, Collection<Object> results) throws IOException {
      long parsed;
      try {
        parsed = processors.getFirst().parseMboxLineByLine(current);
      } catch (IOException e) {
        throw new IllegalStateException("Error processing " + current, e);
      }
      // Update the counter only after a successful parse so the stacks stay aligned on failure.
      messageCounts.push(messageCounts.pop() + parsed);
    }

    /** Pops the finished sub-directory's processor and folds its count into the parent's count. */
    @Override
    protected void handleDirectoryEnd(File current, int depth, Collection<Object> results) throws IOException {
      if (depth > 0) {
        final long currentDirMessageCount = messageCounts.pop();
        log.info("Parsed {} messages from directory {}", currentDirMessageCount, current.getAbsolutePath());

        processors.pop();

        // aggregate message counts
        messageCounts.push(messageCounts.pop() + currentDirMessageCount);
      }
    }
  }

  /**
   * Command-line entry point: runs this job through {@link ToolRunner} with a fresh
   * {@link Configuration}. The status returned by the runner is not inspected.
   *
   * @param args command-line arguments forwarded to the job
   * @throws Exception whatever the runner or the job throws
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    SequenceFilesFromMailArchives tool = new SequenceFilesFromMailArchives();
    ToolRunner.run(conf, tool, args);
  }

  /**
   * Registers the command-line options, parses {@code args}, builds the {@link MailOptions}
   * and runs the conversion sequentially or as a MapReduce job, depending on the method option
   * (MapReduce is used unless the sequential method is requested).
   *
   * @param args command-line arguments
   * @return {@code -1} when {@code parseArguments} returns {@code null}; otherwise the status
   *         reported by the selected execution path ({@code 0} on success)
   * @throws Exception if option parsing or the conversion fails
   */
  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());

    addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
    addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
    addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
      "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
    addFlag(SUBJECT_OPTION[0], SUBJECT_OPTION[1], "Include the Mail subject as part of the text.  Default is false");
    addFlag(TO_OPTION[0], TO_OPTION[1], "Include the to field in the text.  Default is false");
    addFlag(FROM_OPTION[0], FROM_OPTION[1], "Include the from field in the text.  Default is false");
    addFlag(REFERENCES_OPTION[0], REFERENCES_OPTION[1],
      "Include the references field in the text.  Default is false");
    addFlag(BODY_OPTION[0], BODY_OPTION[1], "Include the body in the output.  Default is false");
    addFlag(STRIP_QUOTED_OPTION[0], STRIP_QUOTED_OPTION[1],
      "Strip (remove) quoted email text in the body.  Default is false");
    addOption(QUOTED_REGEX_OPTION[0], QUOTED_REGEX_OPTION[1],
        "Specify the regex that identifies quoted text.  "
          + "Default is to look for > or | at the beginning of the line.");
    addOption(SEPARATOR_OPTION[0], SEPARATOR_OPTION[1],
        "The separator to use between metadata items (to, from, etc.).  Default is \\n", "\n");
    addOption(BODY_SEPARATOR_OPTION[0], BODY_SEPARATOR_OPTION[1],
        "The separator to use between lines in the body.  Default is \\n.  "
          + "Useful to change if you wish to have the message be on one line", "\n");

    addOption(DefaultOptionCreator.helpOption());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }
    File input = getInputFile();
    String outputDir = getOutputPath().toString();

    int chunkSize = 64;
    if (hasOption(CHUNK_SIZE_OPTION[0])) {
      chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
    }

    String prefix = "";
    if (hasOption(KEY_PREFIX_OPTION[0])) {
      prefix = getOption(KEY_PREFIX_OPTION[0]);
    }

    Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
    MailOptions options = new MailOptions();
    options.setInput(input);
    options.setOutputDir(outputDir);
    options.setPrefix(prefix);
    options.setChunkSize(chunkSize);
    options.setCharset(charset);

    List<Pattern> patterns = new ArrayList<>(5);
    // patternOrder is used downstream so that we can know what order the text
    // is in instead of encoding it in the string, which
    // would require more processing later to remove it pre feature selection.
    // Each value is the index of the matching pattern in `patterns`, so every
    // entry must be registered with a post-increment of `order`.
    Map<String, Integer> patternOrder = new HashMap<>();
    int order = 0;
    if (hasOption(FROM_OPTION[0])) {
      patterns.add(MailProcessor.FROM_PREFIX);
      patternOrder.put(MailOptions.FROM, order++);
    }
    if (hasOption(TO_OPTION[0])) {
      patterns.add(MailProcessor.TO_PREFIX);
      patternOrder.put(MailOptions.TO, order++);
    }
    if (hasOption(REFERENCES_OPTION[0])) {
      patterns.add(MailProcessor.REFS_PREFIX);
      patternOrder.put(MailOptions.REFS, order++);
    }
    if (hasOption(SUBJECT_OPTION[0])) {
      patterns.add(MailProcessor.SUBJECT_PREFIX);
      patternOrder.put(MailOptions.SUBJECT, order++);
    }
    options.setStripQuotedText(hasOption(STRIP_QUOTED_OPTION[0]));

    options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
    options.setPatternOrder(patternOrder);
    options.setIncludeBody(hasOption(BODY_OPTION[0]));

    if (hasOption(SEPARATOR_OPTION[0])) {
      options.setSeparator(getOption(SEPARATOR_OPTION[0]));
    } else {
      options.setSeparator("\n");
    }

    if (hasOption(BODY_SEPARATOR_OPTION[0])) {
      options.setBodySeparator(getOption(BODY_SEPARATOR_OPTION[0]));
    }

    if (hasOption(QUOTED_REGEX_OPTION[0])) {
      options.setQuotedTextPattern(Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])));
    }

    // Propagate the status of the chosen path so that a failed job is not reported as success.
    if (getOption(DefaultOptionCreator.METHOD_OPTION,
      DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
      return runSequential(options);
    }
    return runMapReduce(getInputPath(), getOutputPath());
  }

  /**
   * Runs the conversion in the current JVM via {@link #createSequenceFiles(MailOptions)} and
   * logs the elapsed wall-clock time in milliseconds.
   *
   * @param options fully populated conversion options
   * @return always {@code 0}
   * @throws IOException if the conversion fails
   */
  private int runSequential(MailOptions options)
    throws IOException, InterruptedException, NoSuchMethodException {

    final long startedAt = System.currentTimeMillis();
    createSequenceFiles(options);
    final long elapsedMs = System.currentTimeMillis() - startedAt;
    log.info("Conversion took {}ms", elapsedMs);

    return 0;
  }

  /**
   * Configures and runs the MapReduce variant of the conversion.
   *
   * <p>The parsed command-line options are copied into the job {@link Configuration}, the input
   * paths are set to the directory list built from {@code input}, and the maximum input split
   * size is set to the chunk size converted to bytes.
   *
   * @param input  root of the mail archives to convert
   * @param output directory that receives the sequence files
   * @return {@code 0} if the job completed successfully, {@code -1} otherwise
   * @throws IOException            if the file system or the job cannot be accessed
   * @throws InterruptedException   if waiting for the job is interrupted
   * @throws ClassNotFoundException if a job class cannot be loaded
   */
  private int runMapReduce(Path input, Path output) throws IOException, InterruptedException, ClassNotFoundException {

    Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromMailArchivesMapper.class,
      Text.class, Text.class, SequenceFileOutputFormat.class, "SequentialFilesFromMailArchives");

    Configuration jobConfig = job.getConfiguration();

    if (hasOption(KEY_PREFIX_OPTION[0])) {
      jobConfig.set(KEY_PREFIX_OPTION[1], getOption(KEY_PREFIX_OPTION[0]));
    }

    int chunkSize = 0;
    if (hasOption(CHUNK_SIZE_OPTION[0])) {
      chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
      jobConfig.set(CHUNK_SIZE_OPTION[0], String.valueOf(chunkSize));
    }

    if (hasOption(CHARSET_OPTION[0])) {
      // Charset.forName rejects unknown or illegal names here, before the job is submitted.
      Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
      jobConfig.set(CHARSET_OPTION[0], charset.displayName());
    }

    // Header-field flags are only written to the configuration when they were requested.
    String[][] headerFlags = {FROM_OPTION, TO_OPTION, REFERENCES_OPTION, SUBJECT_OPTION};
    for (String[] flag : headerFlags) {
      if (hasOption(flag[0])) {
        jobConfig.set(flag[1], "true");
      }
    }

    if (hasOption(QUOTED_REGEX_OPTION[0])) {
      // Compiling validates the regex up front; Pattern.toString() yields the source expression.
      jobConfig.set(QUOTED_REGEX_OPTION[1], Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])).toString());
    }

    jobConfig.set(SEPARATOR_OPTION[1],
      hasOption(SEPARATOR_OPTION[0]) ? getOption(SEPARATOR_OPTION[0]) : "\n");

    // Unlike the header flags, the body flag is always written, as "true" or "false".
    jobConfig.set(BODY_OPTION[1], String.valueOf(hasOption(BODY_OPTION[0])));

    jobConfig.set(BODY_SEPARATOR_OPTION[1],
      hasOption(BODY_SEPARATOR_OPTION[0]) ? getOption(BODY_SEPARATOR_OPTION[0]) : "\n");

    // Use the `input` argument throughout so the job is driven by the path the caller supplied.
    FileSystem fs = FileSystem.get(jobConfig);
    FileStatus fsFileStatus = fs.getFileStatus(input);

    jobConfig.set(BASE_INPUT_PATH, input.toString());
    String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
    FileInputFormat.setInputPaths(job, inputDirList);

    // Multiply as long: in int arithmetic, chunk sizes of 2048 MB or more overflow.
    long chunkSizeInBytes = chunkSize * 1024L * 1024L;
    // need to set this to a multiple of the block size, or no split happens
    FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);

    // set the max split locations, otherwise we get nasty debug stuff
    jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));

    return job.waitForCompletion(true) ? 0 : -1;
  }
}