All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.crunch.contrib.text.TokenizerFactory Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.contrib.text;

import java.io.Serializable;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;

/**
 * Factory class that constructs {@link Tokenizer} instances for input strings that use a fixed
 * set of delimiters, skip patterns, locales, and sets of indices to keep or drop.
 *
 * <p>Instances are immutable: every field is final and is assigned exactly once by the private
 * constructor. Obtain an instance either from {@link #getDefaultInstance()} or by configuring a
 * {@link Builder} returned from {@link #builder()}.
 */
public class TokenizerFactory implements Serializable {

  /** Shared factory with no delimiter, skip pattern, or locale overrides and no indices. */
  private static final TokenizerFactory DEFAULT_INSTANCE = new TokenizerFactory(null, null, null,
      ImmutableSet.<Integer>of(), true);

  /** Delimiter pattern handed to the {@code Scanner}; {@code null} keeps the scanner default. */
  private final String delim;
  /** Pattern skipped once when a {@code Scanner} is created; {@code null} skips nothing. */
  private final String skip;
  /** Locale handed to the {@code Scanner}; {@code null} keeps the scanner default. */
  private final Locale locale;
  /** Zero-based field indices passed on to each {@link Tokenizer}. */
  private final Set<Integer> indices;
  /** Whether {@link #indices} were supplied via {@code keep} (true) or {@code drop} (false). */
  private final boolean keep;

  /**
   * Returns a default {@code TokenizerFactory} that uses whitespace as a delimiter and does
   * not skip any input fields.
   * @return The default {@code TokenizerFactory}
   */
  public static TokenizerFactory getDefaultInstance() {
    return DEFAULT_INSTANCE;
  }

  /**
   * Creates a factory with the given settings. Use {@link #builder()} from client code.
   *
   * @param delim The delimiter pattern, or {@code null} to use the {@code Scanner} default
   * @param skip The pattern to skip at the start of the input, or {@code null} for none
   * @param locale The locale to apply, or {@code null} to use the {@code Scanner} default
   * @param indices The field indices handed to every created {@link Tokenizer}
   * @param keep The keep/drop flag handed to every created {@link Tokenizer}
   */
  private TokenizerFactory(String delim, String skip, Locale locale,
      Set<Integer> indices, boolean keep) {
    this.delim = delim;
    this.skip = skip;
    this.locale = locale;
    this.indices = indices;
    this.keep = keep;
  }

  /**
   * Return a {@link Tokenizer} that wraps a {@code Scanner} over the input string, configured
   * with the delimiter, skip, and locale settings for this {@code TokenizerFactory} instance.
   *
   * <p>Settings that are {@code null} are not applied, so the {@code Scanner} defaults remain
   * in effect for them.
   *
   * @param input The input string
   * @return A new {@code Tokenizer} instance with appropriate settings
   * @throws java.util.NoSuchElementException if a skip pattern is configured and it does not
   *     match at the start of the input (this is the documented behavior of
   *     {@link Scanner#skip(String)})
   */
  public Tokenizer create(String input) {
    Scanner s = new Scanner(input);
    if (delim != null) {
      s.useDelimiter(delim);
    }
    if (skip != null) {
      // Scanner.skip ignores delimiters and only matches at the current position.
      s.skip(skip);
    }
    if (locale != null) {
      s.useLocale(locale);
    }
    return new Tokenizer(s, indices, keep);
  }

  /**
   * Factory method for creating a {@code TokenizerFactory.Builder} instance.
   * @return A new {@code TokenizerFactory.Builder}
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * A class for constructing new {@code TokenizerFactory} instances using the Builder pattern.
   */
  public static class Builder {
    private String delim;
    private String skip;
    private Locale locale;
    private Set<Integer> indices = ImmutableSet.of();
    private boolean keep;

    /**
     * Sets the delimiter used by the {@code TokenizerFactory} instances constructed by
     * this instance.
     * @param delim The delimiter to use, which may be a regular expression
     * @return This {@code Builder} instance
     */
    public Builder delimiter(String delim) {
      this.delim = delim;
      return this;
    }

    /**
     * Sets the regular expression that determines which input characters should be
     * ignored by the {@code Scanner} that is returned by the constructed
     * {@code TokenizerFactory}.
     *
     * @param skip The regular expression of input values to ignore
     * @return This {@code Builder} instance
     */
    public Builder skip(String skip) {
      this.skip = skip;
      return this;
    }

    /**
     * Sets the {@code Locale} to use with the {@code TokenizerFactory} returned by
     * this {@code Builder} instance.
     *
     * @param locale The locale to use
     * @return This {@code Builder} instance
     */
    public Builder locale(Locale locale) {
      this.locale = locale;
      return this;
    }

    /**
     * Keep only the specified fields found by the input scanner, counting from
     * zero.
     *
     * @param indices The indices to keep
     * @return This {@code Builder} instance
     * @throws IllegalArgumentException if a non-empty set of keep or drop indices has
     *     already been configured on this {@code Builder}
     */
    public Builder keep(Integer... indices) {
      Preconditions.checkArgument(this.indices.isEmpty(),
          "Cannot set keep indices more than once");
      this.indices = ImmutableSet.copyOf(indices);
      this.keep = true;
      return this;
    }

    /**
     * Drop the specified fields found by the input scanner, counting from zero.
     *
     * @param indices The indices to drop
     * @return This {@code Builder} instance
     * @throws IllegalArgumentException if a non-empty set of keep or drop indices has
     *     already been configured on this {@code Builder}
     */
    public Builder drop(Integer... indices) {
      Preconditions.checkArgument(this.indices.isEmpty(),
          "Cannot set drop indices more than once");
      this.indices = ImmutableSet.copyOf(indices);
      this.keep = false;
      return this;
    }

    /**
     * Returns a new {@code TokenizerFactory} with settings determined by this
     * {@code Builder} instance.
     * @return A new {@code TokenizerFactory}
     */
    public TokenizerFactory build() {
      return new TokenizerFactory(delim, skip, locale, indices, keep);
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy