// org.apache.hadoop.hive.ql.udf.generic.GenericUDFSentences (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.udf.generic;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Locale;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
/**
* GenericUDFSentences: splits a natural language chunk of text into sentences and words.
*
*/
@Description(name = "sentences", value = "_FUNC_(str, lang, country) - Splits str"
+ " into arrays of sentences, where each sentence is an array of words. The 'lang' and"
+ "'country' arguments are optional, and if omitted, the default locale is used.",
extended = "Example:\n"
+ " > SELECT _FUNC_('Hello there! I am a UDF.') FROM src LIMIT 1;\n"
+ " [ [\"Hello\", \"there\"], [\"I\", \"am\", \"a\", \"UDF\"] ]\n"
+ " > SELECT _FUNC_(review, language) FROM movies;\n"
+ "Unnecessary punctuation, such as periods and commas in English, is automatically stripped."
+ " If specified, 'lang' should be a two-letter ISO-639 language code (such as 'en'), and "
+ "'country' should be a two-letter ISO-3166 code (such as 'us'). Not all country and "
+ "language codes are fully supported, and if an unsupported code is specified, a default "
+ "locale is used to process that string.")
public class GenericUDFSentences extends GenericUDF {
private transient ObjectInspectorConverters.Converter[] converters;
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
if (arguments.length < 1 || arguments.length > 3) {
throw new UDFArgumentLengthException(
"The function sentences takes between 1 and 3 arguments.");
}
converters = new ObjectInspectorConverters.Converter[arguments.length];
for (int i = 0; i < arguments.length; i++) {
converters[i] = ObjectInspectorConverters.getConverter(arguments[i],
PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}
return ObjectInspectorFactory.getStandardListObjectInspector(
ObjectInspectorFactory.getStandardListObjectInspector(
PrimitiveObjectInspectorFactory.writableStringObjectInspector));
}
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
assert (arguments.length >= 1 && arguments.length <= 3);
if (arguments[0].get() == null) {
return null;
}
// if there is more than 1 argument specified, a different natural language
// locale is being specified
Locale locale = null;
if(arguments.length > 1 && arguments[1].get() != null) {
Text language = (Text) converters[1].convert(arguments[1].get());
Text country = null;
if(arguments.length > 2 && arguments[2].get() != null) {
country = (Text) converters[2].convert(arguments[2].get());
}
if(country != null) {
locale = new Locale(language.toString().toLowerCase(), country.toString().toUpperCase());
} else {
locale = new Locale(language.toString().toLowerCase());
}
} else {
locale = Locale.getDefault();
}
// get the input and prepare the output
Text chunk = (Text) converters[0].convert(arguments[0].get());
String text = chunk.toString();
ArrayList > result = new ArrayList >();
// Parse out sentences using Java's text-handling API
BreakIterator bi = BreakIterator.getSentenceInstance(locale);
bi.setText(text);
int idx = 0;
while(bi.next() != BreakIterator.DONE) {
String sentence = text.substring(idx, bi.current());
idx = bi.current();
result.add(new ArrayList());
// Parse out words in the sentence
BreakIterator wi = BreakIterator.getWordInstance(locale);
wi.setText(sentence);
int widx = 0;
ArrayList sent_array = result.get(result.size()-1);
while(wi.next() != BreakIterator.DONE) {
String word = sentence.substring(widx, wi.current());
widx = wi.current();
if(Character.isLetterOrDigit(word.charAt(0))) {
sent_array.add(new Text(word));
}
}
}
return result;
}
@Override
public String getDisplayString(String[] children) {
assert (children.length >= 1 && children.length <= 3);
return getStandardDisplayString("sentences", children);
}
}