org.apache.hadoop.streaming.PipeMapper Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-streaming Show documentation
Apache Hadoop MapReduce Streaming
The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.streaming;

import java.io.*;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.SkipBadRecords;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.streaming.io.InputWriter;
import org.apache.hadoop.streaming.io.OutputReader;
import org.apache.hadoop.streaming.io.TextInputWriter;

/** A generic Mapper bridge.
 *  It delegates operations to an external program via stdin and stdout.
 *
 *  <p>Each call to {@link #map} forwards the incoming record to the
 *  inherited input writer ({@code inWriter_}); the separators and the
 *  number of key fields exposed through the getters below are read from
 *  the job configuration in {@link #configure(JobConf)}.
 */
public class PipeMapper extends PipeMapRed implements Mapper {

  /** When true, {@link #map} forwards only the value and omits the key. */
  private boolean ignoreKey = false;

  /**
   * Value of {@code MRJobConfig.SKIP_RECORDS}; when true, {@code clientOut_}
   * is flushed after every record written in {@link #map}.
   */
  private boolean skipping = false;

  /** UTF-8 bytes of {@code stream.map.output.field.separator} (default tab). */
  private byte[] mapOutputFieldSeparator;

  /** UTF-8 bytes of {@code stream.map.input.field.separator} (default tab). */
  private byte[] mapInputFieldSeparator;

  /** Value of {@code stream.num.map.output.key.fields} (default 1). */
  private int numOfMapOutputKeyFields = 1;

  /**
   * Reads the mapper command from {@code stream.map.streamprocessor} and
   * URL-decodes it as UTF-8.
   *
   * @param job the job configuration to read the property from
   * @return the decoded command, or {@code null} if the property is not set
   *         or the UTF-8 encoding name is rejected by the decoder
   */
  String getPipeCommand(JobConf job) {
    String str = job.get("stream.map.streamprocessor");
    if (str == null) {
      // Property is absent: nothing to decode.
      return str;
    }
    try {
      return URLDecoder.decode(str, "UTF-8");
    } catch (UnsupportedEncodingException e) {
      // Reached only if the "UTF-8" encoding name is rejected, which is not
      // expected on a conforming JVM. The property itself WAS found here, so
      // report the decoding failure rather than a missing property.
      System.err.println(
          "stream.map.streamprocessor could not be URL-decoded as UTF-8: " + e);
      return null;
    }
  }

  /**
   * @return always {@code true} for this mapper
   */
  boolean getDoPipe() {
    return true;
  }

  /**
   * Configures the parent class and then reads the mapper-specific settings:
   * skip mode, whether the input key is dropped, the input/output field
   * separators and the number of output key fields.
   *
   * @param job the job configuration
   */
  public void configure(JobConf job) {
    super.configure(job);
    //disable the auto increment of the counter. For streaming, no of
    //processed records could be different(equal or less) than the no of
    //records input.
    SkipBadRecords.setAutoIncrMapperProcCount(job, false);
    skipping = job.getBoolean(MRJobConfig.SKIP_RECORDS, false);

    // ignoreKey is only derived when the input writer is TextInputWriter;
    // otherwise it keeps its initial value (false). Its default is true
    // exactly when the configured input format is TextInputFormat, and
    // stream.map.input.ignoreKey overrides that default.
    if (mapInputWriterClass_.getCanonicalName().equals(TextInputWriter.class.getCanonicalName())) {
      String inputFormatClassName = job.getClass("mapred.input.format.class", TextInputFormat.class).getCanonicalName();
      ignoreKey = job.getBoolean("stream.map.input.ignoreKey",
        inputFormatClassName.equals(TextInputFormat.class.getCanonicalName()));
    }

    mapOutputFieldSeparator = job.get("stream.map.output.field.separator", "\t")
            .getBytes(StandardCharsets.UTF_8);
    mapInputFieldSeparator = job.get("stream.map.input.field.separator", "\t")
            .getBytes(StandardCharsets.UTF_8);
    numOfMapOutputKeyFields = job.getInt("stream.num.map.output.key.fields", 1);
  }

  // Do NOT declare default constructor
  // (MapRed creates it reflectively)

  /**
   * Forwards one input record to the input writer.
   *
   * <p>While no write has failed ({@code numExceptions_ == 0}) the key
   * (unless {@code ignoreKey} is set) and the value are written through
   * {@code inWriter_}. Once a write has failed, later records are only
   * counted in {@code numRecSkipped_}.
   *
   * @param key the input key
   * @param value the input value
   * @param output not used by this method
   * @param reporter not used by this method
   * @throws IOException if {@code outerrThreadsThrowable} is set, or if an
   *         I/O failure occurs and either it is not the first one or fewer
   *         than {@code minRecWrittenToEnableSkip_} records have been written
   */
  public void map(Object key, Object value, OutputCollector output, Reporter reporter) throws IOException {
    if (outerrThreadsThrowable != null) {
      // A failure was recorded in outerrThreadsThrowable: finish and
      // surface it as the cause.
      mapRedFinished();
      throw new IOException("MROutput/MRErrThread failed:",
          outerrThreadsThrowable);
    }
    try {
      // 1/4 Hadoop in
      numRecRead_++;
      maybeLogRecord();

      // 2/4 Hadoop to Tool
      if (numExceptions_ == 0) {
        if (!this.ignoreKey) {
          inWriter_.writeKey(key);
        }
        inWriter_.writeValue(value);
        if (skipping) {
          //flush the streams on every record input if running in skip mode
          //so that we don't buffer other records surrounding a bad record.
          clientOut_.flush();
        }
      } else {
        numRecSkipped_++;
      }
    } catch (IOException io) {
      numExceptions_++;
      if (numExceptions_ > 1 || numRecWritten_ < minRecWrittenToEnableSkip_) {
        // terminate with failure
        LOG.info(getContext() , io);
        mapRedFinished();
        throw io;
      } else {
        // terminate with success:
        // swallow input records although the stream processor failed/closed
      }
    }
  }

  /** Finishes the mapper by delegating to {@code mapRedFinished()}. */
  public void close() {
    mapRedFinished();
  }

  /**
   * @return the bytes of the configured map input field separator
   */
  @Override
  public byte[] getInputSeparator() {
    return mapInputFieldSeparator;
  }

  /**
   * @return the bytes of the configured map output field separator
   */
  @Override
  public byte[] getFieldSeparator() {
    return mapOutputFieldSeparator;
  }

  /**
   * @return the configured number of map output key fields
   */
  @Override
  public int getNumOfKeyFields() {
    return numOfMapOutputKeyFields;
  }

  /**
   * Creates the input writer from {@code mapInputWriterClass_}.
   *
   * @return the input writer produced by the parent class
   * @throws IOException if the parent class fails to create the writer
   */
  @Override
  InputWriter createInputWriter() throws IOException {
    return super.createInputWriter(mapInputWriterClass_);
  }

  /**
   * Creates the output reader from {@code mapOutputReaderClass_}.
   *
   * @return the output reader produced by the parent class
   * @throws IOException if the parent class fails to create the reader
   */
  @Override
  OutputReader createOutputReader() throws IOException {
    return super.createOutputReader(mapOutputReaderClass_);
  }

}