org.apache.hadoop.hive.ql.udf.UDFParseUrl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-exec Show documentation
Show all versions of hive-exec Show documentation
Hive is a data warehouse infrastructure built on top of Hadoop see
http://wiki.apache.org/hadoop/Hive
The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.udf;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
/**
* UDF to extract specfic parts from URL For example,
* parse_url('http://facebook.com/path/p1.php?query=1', 'HOST') will return
* 'facebook.com' For example,
* parse_url('http://facebook.com/path/p1.php?query=1', 'PATH') will return
* '/path/p1.php' parse_url('http://facebook.com/path/p1.php?query=1', 'QUERY')
* will return 'query=1'
* parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'REF') will return
* 'Ref' parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'PROTOCOL')
* will return 'http' Possible values are
* HOST,PATH,QUERY,REF,PROTOCOL,AUTHORITY,FILE,USERINFO Also you can get a value
* of particular key in QUERY, using syntax QUERY: eg: QUERY:k1.
*/
@Description(name = "parse_url",
value = "_FUNC_(url, partToExtract[, key]) - extracts a part from a URL",
extended = "Parts: HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, "
+ "USERINFO\nkey specifies which query to extract\n"
+ "Example:\n"
+ " > SELECT _FUNC_('http://facebook.com/path/p1.php?query=1', "
+ "'HOST') FROM src LIMIT 1;\n"
+ " 'facebook.com'\n"
+ " > SELECT _FUNC_('http://facebook.com/path/p1.php?query=1', "
+ "'QUERY') FROM src LIMIT 1;\n"
+ " 'query=1'\n"
+ " > SELECT _FUNC_('http://facebook.com/path/p1.php?query=1', "
+ "'QUERY', 'query') FROM src LIMIT 1;\n" + " '1'")
public class UDFParseUrl extends UDF {
private String lastUrlStr = null;
private URL url = null;
private Pattern p = null;
private String lastKey = null;
public UDFParseUrl() {
}
public String evaluate(String urlStr, String partToExtract) {
if (urlStr == null || partToExtract == null) {
return null;
}
if (lastUrlStr == null || !urlStr.equals(lastUrlStr)) {
try {
url = new URL(urlStr);
} catch (Exception e) {
return null;
}
}
lastUrlStr = urlStr;
if (partToExtract.equals("HOST")) {
return url.getHost();
}
if (partToExtract.equals("PATH")) {
return url.getPath();
}
if (partToExtract.equals("QUERY")) {
return url.getQuery();
}
if (partToExtract.equals("REF")) {
return url.getRef();
}
if (partToExtract.equals("PROTOCOL")) {
return url.getProtocol();
}
if (partToExtract.equals("FILE")) {
return url.getFile();
}
if (partToExtract.equals("AUTHORITY")) {
return url.getAuthority();
}
if (partToExtract.equals("USERINFO")) {
return url.getUserInfo();
}
return null;
}
public String evaluate(String urlStr, String partToExtract, String key) {
if (!partToExtract.equals("QUERY")) {
return null;
}
String query = this.evaluate(urlStr, partToExtract);
if (query == null) {
return null;
}
if (!key.equals(lastKey)) {
p = Pattern.compile("(&|^)" + key + "=([^&]*)");
}
lastKey = key;
Matcher m = p.matcher(query);
if (m.find()) {
return m.group(2);
}
return null;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy