All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.crunch.contrib.text.Tokenizer Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.contrib.text;

import java.util.Scanner;
import java.util.Set;

/**
 * Manages a {@link Scanner} instance and provides support for returning only a subset
 * of the fields returned by the underlying {@code Scanner}.
 */
public class Tokenizer {

  /** The underlying source of tokens; every read and skip goes through it. */
  private final Scanner scanner;
  /** Zero-based token positions that are either kept or dropped, depending on {@link #keep}. */
  private final Set<Integer> indices;
  /** If true, only positions in {@link #indices} are returned; if false, those positions are skipped. */
  private final boolean keep;
  /** Zero-based position of the token most recently considered; -1 before the first read. */
  private int current;

  /**
   * Create a new {@code Tokenizer} instance.
   *
   * <p>Token positions are counted from zero in the order the {@code Scanner} produces them.
   * If {@code indices} is empty, no filtering is applied and every token is returned,
   * regardless of the value of {@code keep}.
   *
   * @param scanner The scanner to manage
   * @param indices The zero-based token indices to keep/drop; the set is stored by reference,
   *                not copied
   * @param keep Whether the indices should be kept (true) or dropped (false)
   * @throws IllegalArgumentException if any index is negative
   */
  public Tokenizer(Scanner scanner, Set<Integer> indices, boolean keep) {
    this.scanner = scanner;
    this.indices = checkIndices(indices);
    this.keep = keep;
    this.current = -1;
  }

  /**
   * Validates that every index is non-negative.
   *
   * @param indices The indices to validate
   * @return The same set that was passed in, to allow use in an assignment
   * @throws IllegalArgumentException if any index is negative
   */
  private static Set<Integer> checkIndices(Set<Integer> indices) {
    for (Integer index : indices) {
      if (index < 0) {
        throw new IllegalArgumentException("All tokenizer indices must be non-negative");
      }
    }
    return indices;
  }

  /**
   * Moves to the next token position and consumes tokens from the scanner until the
   * position is one that should be returned, or the scanner runs out of tokens.
   */
  private void advance() {
    if (indices.isEmpty()) {
      // No filtering configured: every token is returned as-is.
      return;
    }
    current++;
    // The hasNext() guard must cover both keep and drop mode, so the skip predicate is
    // evaluated as a single operand of the && rather than being split across an ||.
    while (scanner.hasNext() && isSkipped(current)) {
      scanner.next();
      current++;
    }
  }

  /**
   * Returns true if the token at the given position should be discarded.
   *
   * <p>In keep mode a token is skipped when its index is absent from the set; in drop
   * mode it is skipped when its index is present.
   */
  private boolean isSkipped(int index) {
    return keep != indices.contains(index);
  }

  /**
   * Returns true if the underlying {@code Scanner} has any tokens remaining.
   *
   * <p>This reflects the raw scanner only: it does not look ahead past tokens that would be
   * skipped, so a {@code true} result may still be followed by a
   * {@link java.util.NoSuchElementException} when every remaining token is filtered out.
   */
  public boolean hasNext() {
    return scanner.hasNext();
  }

  /**
   * Advance this {@code Tokenizer} and return the next String from the {@code Scanner}.
   *
   * @return The next String from the {@code Scanner}
   * @throws java.util.NoSuchElementException if no token remains after skipping
   */
  public String next() {
    advance();
    return scanner.next();
  }

  /**
   * Advance this {@code Tokenizer} and return the next Long from the {@code Scanner}.
   *
   * @return The next Long from the {@code Scanner}
   * @throws java.util.InputMismatchException if the next token is not a valid long
   * @throws java.util.NoSuchElementException if no token remains after skipping
   */
  public Long nextLong() {
    advance();
    return scanner.nextLong();
  }

  /**
   * Advance this {@code Tokenizer} and return the next Boolean from the {@code Scanner}.
   *
   * @return The next Boolean from the {@code Scanner}
   * @throws java.util.InputMismatchException if the next token is not a valid boolean
   * @throws java.util.NoSuchElementException if no token remains after skipping
   */
  public Boolean nextBoolean() {
    advance();
    return scanner.nextBoolean();
  }

  /**
   * Advance this {@code Tokenizer} and return the next Double from the {@code Scanner}.
   *
   * @return The next Double from the {@code Scanner}
   * @throws java.util.InputMismatchException if the next token is not a valid double
   * @throws java.util.NoSuchElementException if no token remains after skipping
   */
  public Double nextDouble() {
    advance();
    return scanner.nextDouble();
  }

  /**
   * Advance this {@code Tokenizer} and return the next Float from the {@code Scanner}.
   *
   * @return The next Float from the {@code Scanner}
   * @throws java.util.InputMismatchException if the next token is not a valid float
   * @throws java.util.NoSuchElementException if no token remains after skipping
   */
  public Float nextFloat() {
    advance();
    return scanner.nextFloat();
  }

  /**
   * Advance this {@code Tokenizer} and return the next Integer from the {@code Scanner}.
   *
   * @return The next Integer from the {@code Scanner}
   * @throws java.util.InputMismatchException if the next token is not a valid int
   * @throws java.util.NoSuchElementException if no token remains after skipping
   */
  public Integer nextInt() {
    advance();
    return scanner.nextInt();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy