All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.mahout.utils.email.MailOptions Maven / Gradle / Ivy

Go to download

Optional components of Mahout which generally support interaction with third party systems, formats, APIs, etc.

There is a newer version: 0.13.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.email;

import java.io.File;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Configuration options to be used by {@link MailProcessor}. Includes options controlling the exact output format 
 * and which mail fields are included (body, to, from, subject, etc.)
 */
public class MailOptions {

  public static final String FROM = "FROM";
  public static final String TO = "TO";
  public static final String REFS = "REFS";
  public static final String SUBJECT = "SUBJECT";
  public static final Pattern DEFAULT_QUOTED_TEXT = Pattern.compile("^(\\||>)");

  private boolean stripQuotedText;
  private File input;
  private String outputDir;
  private String prefix;
  private int chunkSize;
  private Charset charset;
  private String separator;
  private String bodySeparator = "\n";
  private boolean includeBody;
  private Pattern[] patternsToMatch;
  //maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch.  See MailToRecMapper
  private Map patternOrder;

  //the regular expression to use for identifying quoted text.
  private Pattern quotedTextPattern = DEFAULT_QUOTED_TEXT;

  public File getInput() {
    return input;
  }

  public void setInput(File input) {
    this.input = input;
  }

  public String getOutputDir() {
    return outputDir;
  }

  /**
   * Sets the output directory where sequence files will be written.
   */
  public void setOutputDir(String outputDir) {
    this.outputDir = outputDir;
  }

  public String getPrefix() {
    return prefix;
  }

  /**
   * Sets the prefix that is combined with the archive name and with message ids to create {@code SequenceFile} keys. 
   * @param prefix The name of the directory containing the mail archive is commonly used.
   */
  public void setPrefix(String prefix) {
    this.prefix = prefix;
  }

  public int getChunkSize() {
    return chunkSize;
  }

  /**
   * Sets the size of each generated sequence file, in Megabytes.
   */
  public void setChunkSize(int chunkSize) {
    this.chunkSize = chunkSize;
  }

  public Charset getCharset() {
    return charset;
  }

  /**
   * Sets the encoding of the input
   */
  public void setCharset(Charset charset) {
    this.charset = charset;
  }

  public String getSeparator() {
    return separator;
  }

  /**
   * Sets the separator to use in the output between metadata items (to, from, etc.).
   */
  public void setSeparator(String separator) {
    this.separator = separator;
  }

  public String getBodySeparator() {
    return bodySeparator;
  }

  /**
   * Sets the separator to use in the output between lines in the body, the default is "\n".
   */
  public void setBodySeparator(String bodySeparator) {
    this.bodySeparator = bodySeparator;
  }

  public boolean isIncludeBody() {
    return includeBody;
  }

  /**
   * Sets whether mail bodies are included in the output
   */
  public void setIncludeBody(boolean includeBody) {
    this.includeBody = includeBody;
  }

  public Pattern[] getPatternsToMatch() {
    return patternsToMatch;
  }

  /**
   * Sets the list of patterns to be applied in the given order to extract metadata fields (to, from, subject, etc.)
   *  from the input 
   */
  public void setPatternsToMatch(Pattern[] patternsToMatch) {
    this.patternsToMatch = patternsToMatch;
  }

  public Map getPatternOrder() {
    return patternOrder;
  }

  public void setPatternOrder(Map patternOrder) {
    this.patternOrder = patternOrder;
  }

  /**
   *
   * @return true if we should strip out quoted email text
   */
  public boolean isStripQuotedText() {
    return stripQuotedText;
  }

  /**
   *
   * Sets whether quoted text such as lines starting with | or > is striped off.
   */
  public void setStripQuotedText(boolean stripQuotedText) {
    this.stripQuotedText = stripQuotedText;
  }

  public Pattern getQuotedTextPattern() {
    return quotedTextPattern;
  }

  /**
   * Sets the {@link java.util.regex.Pattern} to use to identify lines that are quoted text. Default is | and >
   * @see #setStripQuotedText(boolean)
   */
  public void setQuotedTextPattern(Pattern quotedTextPattern) {
    this.quotedTextPattern = quotedTextPattern;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy