All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.pig.impl.streaming.OutputHandler Maven / Gradle / Ivy

There is a newer version: 0.17.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.streaming;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.pig.PigStreamingBase;
import org.apache.pig.StreamToPig;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.io.BufferedPositionedInputStream;

import com.google.common.base.Charsets;

/**
 * {@link OutputHandler} is responsible for handling the output of the
 * Pig-Streaming external command.
 *
 * The output of the managed executable could be fetched in a
 * {@link OutputType#SYNCHRONOUS} manner via its stdout or in an
 * {@link OutputType#ASYNCHRONOUS} manner via an external file to which the
 * process wrote its output.
 */
public abstract class OutputHandler {
    /**
     * Sentinel object. It is not referenced inside this class; presumably
     * consumers use it to mark the end of the output (confirm against callers).
     */
    public static final Object END_OF_OUTPUT = new Object();

    /** Record delimiter used unless a sub-class overrides {@link #getRecordDelimiter()}. */
    private static final byte[] DEFAULT_RECORD_DELIM = new byte[] {'\n'};

    /** The line terminator that is re-inserted between lines of a multi-line record. */
    private static final byte[] NEWLINE = new byte[] {'\n'};

    public enum OutputType {SYNCHRONOUS, ASYNCHRONOUS}

    /*
     * The deserializer used to turn the raw output read from the managed
     * process into Tuples.
     *
     * It is the responsibility of the concrete sub-classes to setup and
     * manage the deserializer.
     */
    protected StreamToPig deserializer;

    // Set in bindTo() when the deserializer is a PigStreamingBase, so that
    // getNext() can deserialize straight from the read buffer without copying.
    private PigStreamingBase newDeserializer;

    // Line-oriented reader over the bound stream; null until bindTo() is called.
    protected LineReader in = null;

    // Reusable buffer holding the record currently being assembled.
    private Text currValue = new Text();

    private BufferedPositionedInputStream istream;

    //Both of these ignore the trailing \n.  So if the
    //default delimiter is "\n" recordDelimStr is "".
    //They are initialised lazily on the first call to isEndOfRow().
    private String recordDelimStr = null;
    private int recordDelimLength = 0;

    /**
     * Get the handled OutputType.
     * @return the handled OutputType
     */
    public abstract OutputType getOutputType();

    // flag to mark if close() has already been called
    protected boolean alreadyClosed = false;

    /**
     * Bind the OutputHandler to the InputStream
     * from which to read the output data of the managed process.
     *
     * @param fileName not used by this base implementation
     * @param is InputStream from which to read the output data
     *           of the managed process
     * @param offset not used by this base implementation
     * @param end not used by this base implementation
     * @throws IOException declared for the benefit of overriding sub-classes
     */
    public void bindTo(String fileName, BufferedPositionedInputStream is,
                       long offset, long end) throws IOException {
        this.istream  = is;
        this.in = new LineReader(istream);
        if (this.deserializer instanceof PigStreamingBase) {
            this.newDeserializer = (PigStreamingBase) deserializer;
        }
    }

    /**
     * Get the next output Tuple of the managed process.
     *
     * @return the next output Tuple of the managed process, or
     *         <code>null</code> if no stream has been bound yet or there is
     *         no more output to read
     * @throws IOException if reading from the bound stream or deserializing fails
     */
    public Tuple getNext() throws IOException {
        if (in == null) {
            return null;
        }

        currValue.clear();
        if (!readValue()) {
            return null;
        }

        if (newDeserializer != null) {
            // PigStreamingBase accepts (buffer, offset, length), so no copy is needed.
            return newDeserializer.deserialize(currValue.getBytes(), 0, currValue.getLength());
        } else {
            // The plain StreamToPig API takes a whole array, so hand it an
            // exact-length copy: the backing array of a Text may be longer
            // than the valid data.
            byte[] newBytes = new byte[currValue.getLength()];
            System.arraycopy(currValue.getBytes(), 0, newBytes, 0, currValue.getLength());
            return deserializer.deserialize(newBytes);
        }
    }

    /**
     * Read one complete record into {@link #currValue}, joining several
     * physical lines when the record delimiter has not been seen yet.
     *
     * @return <code>false</code> if nothing could be read, <code>true</code> otherwise
     * @throws IOException if reading from the bound stream fails
     */
    private boolean readValue() throws IOException {
        int num = in.readLine(currValue);
        if (num <= 0) {
            return false;
        }

        while (!isEndOfRow()) {
            //Need to add back the newline character we ate.
            currValue.append(NEWLINE, 0, NEWLINE.length);

            if (!appendNextLine()) {
                //We have no more input, so just break;
                break;
            }
        }

        return true;
    }

    /**
     * Read the next physical line and append it to {@link #currValue}.
     *
     * Only the first <code>getLength()</code> bytes of the line buffer are
     * appended: <code>Text.getBytes()</code> exposes the raw backing array,
     * which can be larger than the valid data, so using its full length
     * could add stale bytes to the record.
     *
     * @return <code>false</code> if no more input was available
     * @throws IOException if reading from the bound stream fails
     */
    private boolean appendNextLine() throws IOException {
        Text line = new Text();
        if (in.readLine(line) <= 0) {
            return false;
        }
        currValue.append(line.getBytes(), 0, line.getLength());
        return true;
    }

    /**
     * Check whether {@link #currValue} already holds a complete record, i.e.
     * whether it ends with the record delimiter (minus its trailing newline).
     *
     * @return <code>true</code> if the current value is a complete record
     */
    private boolean isEndOfRow() {
        if (recordDelimStr == null) {
            byte[] recordDelimBa = getRecordDelimiter();
            //Ignore trailing \n; clamp so an empty delimiter does not yield a negative length
            recordDelimLength = Math.max(0, recordDelimBa.length - 1);
            recordDelimStr = new String(recordDelimBa, 0, recordDelimLength,  Charsets.UTF_8);
        }
        // NOTE(review): a value shorter than the delimiter is reported as a
        // complete record here. That is the long-standing behaviour and is
        // kept as is; confirm it is intended for multi-line records.
        if (recordDelimLength == 0 || currValue.getLength() < recordDelimLength) {
            return true;
        }
        return currValue.find(recordDelimStr, currValue.getLength() - recordDelimLength) >= 0;
    }

    /**
     * Get the byte sequence that terminates a record. Sub-classes may
     * override this; {@link #isEndOfRow()} drops the last byte of the
     * returned array on the assumption that it is a trailing newline.
     *
     * @return the record delimiter, by default a single newline
     */
    protected byte[] getRecordDelimiter() {
        return DEFAULT_RECORD_DELIM;
    }

    /**
     * Close the OutputHandler. Calls after the first successful one are
     * no-ops, and closing a handler that was never bound to a stream does
     * nothing besides marking it as closed.
     *
     * @throws IOException if closing the bound stream fails
     */
    public synchronized void close() throws IOException {
        if (!alreadyClosed) {
            // bindTo() may never have been called, in which case there is nothing to close.
            if (istream != null) {
                istream.close();
                istream = null;
            }
            alreadyClosed = true;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy