All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.crunch.contrib.text.AbstractCompositeExtractor Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.contrib.text;

import java.util.List;

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * Base class for {@code Extractor} instances that delegates the parsing of fields to other
 * {@code Extractor} instances, primarily used for constructing composite records that implement
 * the {@code Tuple} interface.
 */
public abstract class AbstractCompositeExtractor implements Extractor {

  private final TokenizerFactory tokenizerFactory;
  private int errors = 0;
  private boolean errorOnLast;
  private final List> extractors;
  
  public AbstractCompositeExtractor(TokenizerFactory scannerFactory, List> extractors) {
    Preconditions.checkArgument(extractors.size() > 0);
    this.tokenizerFactory = scannerFactory;
    this.extractors = extractors;
  }

  @Override
  public T extract(String input) {
    errorOnLast = false;
    Tokenizer tokenizer = tokenizerFactory.create(input);
    Object[] values = new Object[extractors.size()];
    try {
      for (int i = 0; i < values.length; i++) {
        values[i] = extractors.get(i).extract(tokenizer.next());
        if (extractors.get(i).errorOnLastRecord() && !errorOnLast) {
          errors++;
          errorOnLast = true;
        }
      }
    } catch (Exception e) {
      if (!errorOnLast) {
        errors++;
        errorOnLast = true;
      }
      return getDefaultValue();
    }
    
    return doCreate(values);
  }
  
  @Override
  public void initialize() {
    this.errors = 0;
    this.errorOnLast = false;
    for (Extractor x : extractors) {
      x.initialize();
    }
  }
  
  @Override
  public boolean errorOnLastRecord() {
    return errorOnLast;
  }
  
  @Override
  public ExtractorStats getStats() {
    return new ExtractorStats(errors, Lists.transform(extractors, new Function, Integer>() {
      @Override
      public Integer apply(Extractor input) {
        return input.getStats().getErrorCount();
      }
    }));
  }
  
  /**
   * Subclasses should return a new instance of the object based on the fields parsed by
   * the {@code Extractor} instances for this composite {@code Extractor} instance.
   * 
   * @param values The values that were extracted by the component {@code Extractor} objects
   * @return A new instance of the composite class for this {@code Extractor}
   */
  protected abstract T doCreate(Object[] values);
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy