// org.apache.hadoop.mapreduce.lib.partition.KeyFieldHelper (Maven / Gradle / Ivy)

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.partition;

import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.apache.hadoop.util.UTF8ByteArrayUtils;

/**
 * This is used in {@link KeyFieldBasedComparator} and
 * {@link KeyFieldBasedPartitioner}. Defines all the methods
 * for parsing key specifications. The key specification is of the form:
 * -k pos1[,pos2], where pos is of the form f[.c][opts], where f is the number
 *  of the field to use, and c is the number of the first character from the
 *  beginning of the field. Fields and character posns are numbered starting
 *  with 1; a character position of zero in pos2 indicates the field's last
 *  character. If '.c' is omitted from pos1, it defaults to 1 (the beginning
 *  of the field); if omitted from pos2, it defaults to 0 (the end of the
 *  field). opts are ordering options (supported options are 'nr'). 
 */

class KeyFieldHelper {
  
  protected static class KeyDescription {
    int beginFieldIdx = 1;
    int beginChar = 1;
    int endFieldIdx = 0;
    int endChar = 0;
    boolean numeric;
    boolean reverse;
    @Override
    public String toString() {
      return "-k" 
             + beginFieldIdx + "." + beginChar + "," 
             + endFieldIdx + "." + endChar 
             + (numeric ? "n" : "") + (reverse ? "r" : "");
    }
  }
  
  private List allKeySpecs = new ArrayList();
  private byte[] keyFieldSeparator;
  private boolean keySpecSeen = false;
  
  public void setKeyFieldSeparator(String keyFieldSeparator) {
    try {
      this.keyFieldSeparator =
        keyFieldSeparator.getBytes("UTF-8");
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException("The current system does not " +
          "support UTF-8 encoding!", e);
    }    
  }
  
  /** Required for backcompatibility with num.key.fields.for.partition in
   * {@link KeyFieldBasedPartitioner} */
  public void setKeyFieldSpec(int start, int end) {
    if (end >= start) {
      KeyDescription k = new KeyDescription();
      k.beginFieldIdx = start;
      k.endFieldIdx = end;
      keySpecSeen = true;
      allKeySpecs.add(k);
    }
  }
  
  public List keySpecs() {
    return allKeySpecs;
  }
    
  public int[] getWordLengths(byte []b, int start, int end) {
    //Given a string like "hello how are you", it returns an array
    //like [4 5, 3, 3, 3], where the first element is the number of
	//fields
    if (!keySpecSeen) {
      //if there were no key specs, then the whole key is one word
      return new int[] {1};
    }
    int[] lengths = new int[10];
    int currLenLengths = lengths.length;
    int idx = 1;
    int pos;
    while ((pos = UTF8ByteArrayUtils.findBytes(b, start, end, 
        keyFieldSeparator)) != -1) {
      if (++idx == currLenLengths) {
        int[] temp = lengths;
        lengths = new int[(currLenLengths = currLenLengths*2)];
        System.arraycopy(temp, 0, lengths, 0, temp.length);
      }
      lengths[idx - 1] = pos - start;
      start = pos + 1;
    }
    
    if (start != end) {
      lengths[idx] = end - start;
    }
    lengths[0] = idx; //number of words is the first element
    return lengths;
  }
  public int getStartOffset(byte[]b, int start, int end, 
      int []lengthIndices, KeyDescription k) {
    //if -k2.5,2 is the keyspec, the startChar is lengthIndices[1] + 5
    //note that the [0]'th element is the number of fields in the key
    if (lengthIndices[0] >= k.beginFieldIdx) {
      int position = 0;
      for (int i = 1; i < k.beginFieldIdx; i++) {
        position += lengthIndices[i] + keyFieldSeparator.length; 
      }
      if (position + k.beginChar <= (end - start)) {
        return start + position + k.beginChar - 1; 
      }
    }
    return -1;
  }
  public int getEndOffset(byte[]b, int start, int end, 
      int []lengthIndices, KeyDescription k) {
    //if -k2,2.8 is the keyspec, the endChar is lengthIndices[1] + 8
    //note that the [0]'th element is the number of fields in the key
    if (k.endFieldIdx == 0) {
      //there is no end field specified for this keyspec. So the remaining
      //part of the key is considered in its entirety.
      return end - 1; 
    }
    if (lengthIndices[0] >= k.endFieldIdx) {
      int position = 0;
      int i;
      for (i = 1; i < k.endFieldIdx; i++) {
        position += lengthIndices[i] + keyFieldSeparator.length;
      }
      if (k.endChar == 0) { 
        position += lengthIndices[i];
      }
      if (position + k.endChar <= (end - start)) {
        return start + position + k.endChar - 1;
      }
      return end - 1;
    }
    return end - 1;
  }
  public void parseOption(String option) {
    if (option == null || option.equals("")) {
      //we will have only default comparison
      return;
    }
    StringTokenizer args = new StringTokenizer(option);
    KeyDescription global = new KeyDescription();
    while (args.hasMoreTokens()) {
      String arg = args.nextToken();
      if (arg.equals("-n")) {  
        global.numeric = true;
      }
      if (arg.equals("-r")) {
        global.reverse = true;
      }
      if (arg.equals("-nr")) {
        global.numeric = true;
        global.reverse = true;
      }
      if (arg.startsWith("-k")) {
        KeyDescription k = parseKey(arg, args);
        if (k != null) {
          allKeySpecs.add(k);
          keySpecSeen = true;
        }
      }
    }
    for (KeyDescription key : allKeySpecs) {
      if (!(key.reverse | key.numeric)) {
        key.reverse = global.reverse;
        key.numeric = global.numeric;
      }
    }
    if (allKeySpecs.size() == 0) {
      allKeySpecs.add(global);
    }
  }
  
  private KeyDescription parseKey(String arg, StringTokenizer args) {
    //we allow for -k and -k 
    String keyArgs = null;
    if (arg.length() == 2) {
      if (args.hasMoreTokens()) {
        keyArgs = args.nextToken();
      }
    } else {
      keyArgs = arg.substring(2);
    }
    if (keyArgs == null || keyArgs.length() == 0) {
      return null;
    }
    StringTokenizer st = new StringTokenizer(keyArgs,"nr.,",true);
       
    KeyDescription key = new KeyDescription();
    
    String token;
    //the key is of the form 1[.3][nr][,1.5][nr]
    if (st.hasMoreTokens()) {
      token = st.nextToken();
      //the first token must be a number
      key.beginFieldIdx = Integer.parseInt(token);
    }
    if (st.hasMoreTokens()) {
      token = st.nextToken();
      if (token.equals(".")) {
        token = st.nextToken();
        key.beginChar = Integer.parseInt(token);
        if (st.hasMoreTokens()) {
          token = st.nextToken();
        } else {
          return key;
        }
      } 
      do {
        if (token.equals("n")) {
          key.numeric = true;
        }
        else if (token.equals("r")) {
          key.reverse = true;
        }
        else break;
        if (st.hasMoreTokens()) {
          token = st.nextToken();
        } else {
          return key;
        }
      } while (true);
      if (token.equals(",")) {
        token = st.nextToken();
        //the first token must be a number
        key.endFieldIdx = Integer.parseInt(token);
        if (st.hasMoreTokens()) {
          token = st.nextToken();
          if (token.equals(".")) {
            token = st.nextToken();
            key.endChar = Integer.parseInt(token);
            if (st.hasMoreTokens()) {
              token = st.nextToken();
            } else {
              return key;
            }
          }
          do {
            if (token.equals("n")) {
              key.numeric = true;
            }
            else if (token.equals("r")) {
              key.reverse = true;
            }
            else { 
              throw new IllegalArgumentException("Invalid -k argument. " +
               "Must be of the form -k pos1,[pos2], where pos is of the form " +
               "f[.c]nr");
            }
            if (st.hasMoreTokens()) {
              token = st.nextToken();
            } else {
              break;
            }
          } while (true);
        }
        return key;
      }
      throw new IllegalArgumentException("Invalid -k argument. " +
          "Must be of the form -k pos1,[pos2], where pos is of the form " +
          "f[.c]nr");
    }
    return key;
  }
  private void printKey(KeyDescription key) {
    System.out.println("key.beginFieldIdx: " + key.beginFieldIdx);
    System.out.println("key.beginChar: " + key.beginChar);
    System.out.println("key.endFieldIdx: " + key.endFieldIdx);
    System.out.println("key.endChar: " + key.endChar);
    System.out.println("key.numeric: " + key.numeric);
    System.out.println("key.reverse: " + key.reverse);
    System.out.println("parseKey over");
  }  
}