org.apache.hadoop.mapreduce.lib.partition.KeyFieldHelper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-apache Show documentation
Show all versions of hadoop-apache Show documentation
Shaded version of Apache Hadoop for Presto
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce.lib.partition;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.hadoop.util.UTF8ByteArrayUtils;
/**
* This is used in {@link KeyFieldBasedComparator} and
* {@link KeyFieldBasedPartitioner}. Defines all the methods
* for parsing key specifications. The key specification is of the form:
* -k pos1[,pos2], where pos is of the form f[.c][opts], where f is the number
* of the field to use, and c is the number of the first character from the
* beginning of the field. Fields and character positions are numbered starting
* with 1; a character position of zero in pos2 indicates the field's last
* character. If '.c' is omitted from pos1, it defaults to 1 (the beginning
* of the field); if omitted from pos2, it defaults to 0 (the end of the
* field). opts are ordering options (supported options are 'nr').
*/
class KeyFieldHelper {
protected static class KeyDescription {
int beginFieldIdx = 1;
int beginChar = 1;
int endFieldIdx = 0;
int endChar = 0;
boolean numeric;
boolean reverse;
@Override
public String toString() {
return "-k"
+ beginFieldIdx + "." + beginChar + ","
+ endFieldIdx + "." + endChar
+ (numeric ? "n" : "") + (reverse ? "r" : "");
}
}
private List allKeySpecs = new ArrayList();
private byte[] keyFieldSeparator;
private boolean keySpecSeen = false;
public void setKeyFieldSeparator(String keyFieldSeparator) {
try {
this.keyFieldSeparator =
keyFieldSeparator.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("The current system does not " +
"support UTF-8 encoding!", e);
}
}
/** Required for backcompatibility with num.key.fields.for.partition in
* {@link KeyFieldBasedPartitioner} */
public void setKeyFieldSpec(int start, int end) {
if (end >= start) {
KeyDescription k = new KeyDescription();
k.beginFieldIdx = start;
k.endFieldIdx = end;
keySpecSeen = true;
allKeySpecs.add(k);
}
}
public List keySpecs() {
return allKeySpecs;
}
public int[] getWordLengths(byte []b, int start, int end) {
//Given a string like "hello how are you", it returns an array
//like [4 5, 3, 3, 3], where the first element is the number of
//fields
if (!keySpecSeen) {
//if there were no key specs, then the whole key is one word
return new int[] {1};
}
int[] lengths = new int[10];
int currLenLengths = lengths.length;
int idx = 1;
int pos;
while ((pos = UTF8ByteArrayUtils.findBytes(b, start, end,
keyFieldSeparator)) != -1) {
if (++idx == currLenLengths) {
int[] temp = lengths;
lengths = new int[(currLenLengths = currLenLengths*2)];
System.arraycopy(temp, 0, lengths, 0, temp.length);
}
lengths[idx - 1] = pos - start;
start = pos + 1;
}
if (start != end) {
lengths[idx] = end - start;
}
lengths[0] = idx; //number of words is the first element
return lengths;
}
public int getStartOffset(byte[]b, int start, int end,
int []lengthIndices, KeyDescription k) {
//if -k2.5,2 is the keyspec, the startChar is lengthIndices[1] + 5
//note that the [0]'th element is the number of fields in the key
if (lengthIndices[0] >= k.beginFieldIdx) {
int position = 0;
for (int i = 1; i < k.beginFieldIdx; i++) {
position += lengthIndices[i] + keyFieldSeparator.length;
}
if (position + k.beginChar <= (end - start)) {
return start + position + k.beginChar - 1;
}
}
return -1;
}
public int getEndOffset(byte[]b, int start, int end,
int []lengthIndices, KeyDescription k) {
//if -k2,2.8 is the keyspec, the endChar is lengthIndices[1] + 8
//note that the [0]'th element is the number of fields in the key
if (k.endFieldIdx == 0) {
//there is no end field specified for this keyspec. So the remaining
//part of the key is considered in its entirety.
return end - 1;
}
if (lengthIndices[0] >= k.endFieldIdx) {
int position = 0;
int i;
for (i = 1; i < k.endFieldIdx; i++) {
position += lengthIndices[i] + keyFieldSeparator.length;
}
if (k.endChar == 0) {
position += lengthIndices[i];
}
if (position + k.endChar <= (end - start)) {
return start + position + k.endChar - 1;
}
return end - 1;
}
return end - 1;
}
public void parseOption(String option) {
if (option == null || option.equals("")) {
//we will have only default comparison
return;
}
StringTokenizer args = new StringTokenizer(option);
KeyDescription global = new KeyDescription();
while (args.hasMoreTokens()) {
String arg = args.nextToken();
if (arg.equals("-n")) {
global.numeric = true;
}
if (arg.equals("-r")) {
global.reverse = true;
}
if (arg.equals("-nr")) {
global.numeric = true;
global.reverse = true;
}
if (arg.startsWith("-k")) {
KeyDescription k = parseKey(arg, args);
if (k != null) {
allKeySpecs.add(k);
keySpecSeen = true;
}
}
}
for (KeyDescription key : allKeySpecs) {
if (!(key.reverse | key.numeric)) {
key.reverse = global.reverse;
key.numeric = global.numeric;
}
}
if (allKeySpecs.size() == 0) {
allKeySpecs.add(global);
}
}
private KeyDescription parseKey(String arg, StringTokenizer args) {
//we allow for -k and -k
String keyArgs = null;
if (arg.length() == 2) {
if (args.hasMoreTokens()) {
keyArgs = args.nextToken();
}
} else {
keyArgs = arg.substring(2);
}
if (keyArgs == null || keyArgs.length() == 0) {
return null;
}
StringTokenizer st = new StringTokenizer(keyArgs,"nr.,",true);
KeyDescription key = new KeyDescription();
String token;
//the key is of the form 1[.3][nr][,1.5][nr]
if (st.hasMoreTokens()) {
token = st.nextToken();
//the first token must be a number
key.beginFieldIdx = Integer.parseInt(token);
}
if (st.hasMoreTokens()) {
token = st.nextToken();
if (token.equals(".")) {
token = st.nextToken();
key.beginChar = Integer.parseInt(token);
if (st.hasMoreTokens()) {
token = st.nextToken();
} else {
return key;
}
}
do {
if (token.equals("n")) {
key.numeric = true;
}
else if (token.equals("r")) {
key.reverse = true;
}
else break;
if (st.hasMoreTokens()) {
token = st.nextToken();
} else {
return key;
}
} while (true);
if (token.equals(",")) {
token = st.nextToken();
//the first token must be a number
key.endFieldIdx = Integer.parseInt(token);
if (st.hasMoreTokens()) {
token = st.nextToken();
if (token.equals(".")) {
token = st.nextToken();
key.endChar = Integer.parseInt(token);
if (st.hasMoreTokens()) {
token = st.nextToken();
} else {
return key;
}
}
do {
if (token.equals("n")) {
key.numeric = true;
}
else if (token.equals("r")) {
key.reverse = true;
}
else {
throw new IllegalArgumentException("Invalid -k argument. " +
"Must be of the form -k pos1,[pos2], where pos is of the form " +
"f[.c]nr");
}
if (st.hasMoreTokens()) {
token = st.nextToken();
} else {
break;
}
} while (true);
}
return key;
}
throw new IllegalArgumentException("Invalid -k argument. " +
"Must be of the form -k pos1,[pos2], where pos is of the form " +
"f[.c]nr");
}
return key;
}
private void printKey(KeyDescription key) {
System.out.println("key.beginFieldIdx: " + key.beginFieldIdx);
System.out.println("key.beginChar: " + key.beginChar);
System.out.println("key.endFieldIdx: " + key.endFieldIdx);
System.out.println("key.endChar: " + key.endChar);
System.out.println("key.numeric: " + key.numeric);
System.out.println("key.reverse: " + key.reverse);
System.out.println("parseKey over");
}
}