All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.udf.UDFParseUrl Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.udf;

import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * UDF to extract specfic parts from URL For example,
 * parse_url('http://facebook.com/path/p1.php?query=1', 'HOST') will return
 * 'facebook.com' For example,
 * parse_url('http://facebook.com/path/p1.php?query=1', 'PATH') will return
 * '/path/p1.php' parse_url('http://facebook.com/path/p1.php?query=1', 'QUERY')
 * will return 'query=1'
 * parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'REF') will return
 * 'Ref' parse_url('http://facebook.com/path/p1.php?query=1#Ref', 'PROTOCOL')
 * will return 'http' Possible values are
 * HOST,PATH,QUERY,REF,PROTOCOL,AUTHORITY,FILE,USERINFO Also you can get a value
 * of particular key in QUERY, using syntax QUERY: eg: QUERY:k1.
 */
@Description(name = "parse_url",
    value = "_FUNC_(url, partToExtract[, key]) - extracts a part from a URL",
    extended = "Parts: HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, "
    + "USERINFO\nkey specifies which query to extract\n"
    + "Example:\n"
    + "  > SELECT _FUNC_('http://facebook.com/path/p1.php?query=1', "
    + "'HOST') FROM src LIMIT 1;\n"
    + "  'facebook.com'\n"
    + "  > SELECT _FUNC_('http://facebook.com/path/p1.php?query=1', "
    + "'QUERY') FROM src LIMIT 1;\n"
    + "  'query=1'\n"
    + "  > SELECT _FUNC_('http://facebook.com/path/p1.php?query=1', "
    + "'QUERY', 'query') FROM src LIMIT 1;\n" + "  '1'")
public class UDFParseUrl extends UDF {
  private String lastUrlStr = null;
  private URL url = null;
  private Pattern p = null;
  private String lastKey = null;

  public UDFParseUrl() {
  }

  public String evaluate(String urlStr, String partToExtract) {
    if (urlStr == null || partToExtract == null) {
      return null;
    }

    if (lastUrlStr == null || !urlStr.equals(lastUrlStr)) {
      try {
        url = new URL(urlStr);
      } catch (Exception e) {
        return null;
      }
    }
    lastUrlStr = urlStr;

    if (partToExtract.equals("HOST")) {
      return url.getHost();
    }
    if (partToExtract.equals("PATH")) {
      return url.getPath();
    }
    if (partToExtract.equals("QUERY")) {
      return url.getQuery();
    }
    if (partToExtract.equals("REF")) {
      return url.getRef();
    }
    if (partToExtract.equals("PROTOCOL")) {
      return url.getProtocol();
    }
    if (partToExtract.equals("FILE")) {
      return url.getFile();
    }
    if (partToExtract.equals("AUTHORITY")) {
      return url.getAuthority();
    }
    if (partToExtract.equals("USERINFO")) {
      return url.getUserInfo();
    }

    return null;
  }

  public String evaluate(String urlStr, String partToExtract, String key) {
    if (!partToExtract.equals("QUERY")) {
      return null;
    }

    String query = this.evaluate(urlStr, partToExtract);
    if (query == null) {
      return null;
    }

    if (!key.equals(lastKey)) {
      p = Pattern.compile("(&|^)" + key + "=([^&]*)");
    }

    lastKey = key;
    Matcher m = p.matcher(query);
    if (m.find()) {
      return m.group(2);
    }
    return null;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy