hbase-ruby.hbase.table.rb Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hbase-sdk-shell_1.2 Show documentation
The newest version!
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

include Java

# Wrapper for org.apache.hadoop.hbase.client.Table

module Hbase
  class Table
    include HBaseConstants

    @@thread_pool = nil

    # Add the command 'name' to table s.t. the shell command also called via 'name'
    # and has an internal method also called 'name'.
    #
    # e.g. name = scan, adds table.scan which calls Scan.scan
    def self.add_shell_command(name)
      self.add_command(name, name, name)
    end

    # add a named command to the table instance
    #
    # name - name of the command that should added to the table
    #    (eg. sending 'scan' here would allow you to do table.scan)
    # shell_command - name of the command in the shell
    # internal_method_name - name of the method in the shell command to forward the call
    def self.add_command(name, shell_command, internal_method_name)
      method  = name.to_sym
      self.class_eval do
        define_method method do |*args|
            @shell.internal_command(shell_command, internal_method_name, self, *args)
         end
      end
    end

    # General help for the table
    # class level so we can call it from anywhere
    def self.help
      return <<-EOF
Help for table-reference commands.

You can either create a table via 'create' and then manipulate the table via commands like 'put', 'get', etc.
See the standard help information for how to use each of these commands.

However, as of 0.96, you can also get a reference to a table, on which you can invoke commands.
For instance, you can get create a table and keep around a reference to it via:

   hbase> t = create 't', 'cf'

Or, if you have already created the table, you can get a reference to it:

   hbase> t = get_table 't'

You can do things like call 'put' on the table:

  hbase> t.put 'r', 'cf:q', 'v'

which puts a row 'r' with column family 'cf', qualifier 'q' and value 'v' into table t.

To read the data out, you can scan the table:

  hbase> t.scan

which will read all the rows in table 't'.

Essentially, any command that takes a table name can also be done via table reference.
Other commands include things like: get, delete, deleteall,
get_all_columns, get_counter, count, incr. These functions, along with
the standard JRuby object methods are also available via tab completion.

For more information on how to use each of these commands, you can also just type:

   hbase> t.help 'scan'

which will output more information on how to use that command.

You can also do general admin actions directly on a table; things like enable, disable,
flush and drop just by typing:

   hbase> t.enable
   hbase> t.flush
   hbase> t.disable
   hbase> t.drop

Note that after dropping a table, your reference to it becomes useless and further usage
is undefined (and not recommended).
EOF
    end

    #---------------------------------------------------------------------------------------------

    # let external objects read the underlying table object
    attr_reader :table
    # let external objects read the table name
    attr_reader :name

    def initialize(table, shell)
      @table = table
      @name = @table.getName().getNameAsString()
      @shell = shell
      @converters = Hash.new()
    end

    def close()
      @table.close()
    end

    # Note the below methods are prefixed with '_' to hide them from the average user, as
    # they will be much less likely to tab complete to the 'dangerous' internal method
    #----------------------------------------------------------------------------------------------

    # Put a cell 'value' at specified table/row/column
    def _put_internal(row, column, value, timestamp = nil, args = {})
      p = org.apache.hadoop.hbase.client.Put.new(row.to_s.to_java_bytes)
      family, qualifier = parse_column_name(column)
      if args.any?
         attributes = args[ATTRIBUTES]
         set_attributes(p, attributes) if attributes
         visibility = args[VISIBILITY]
         set_cell_visibility(p, visibility) if visibility
         ttl = args[TTL]
         set_op_ttl(p, ttl) if ttl
      end
      #Case where attributes are specified without timestamp
      if timestamp.kind_of?(Hash)
      	timestamp.each do |k, v|
          if k == 'ATTRIBUTES'
            set_attributes(p, v)
          elsif k == 'VISIBILITY'
            set_cell_visibility(p, v)
          elsif k == "TTL"
            set_op_ttl(p, v)
          end
        end
        timestamp = nil
      end
      if timestamp
        p.add(family, qualifier, timestamp, value.to_s.to_java_bytes)
      else
        p.add(family, qualifier, value.to_s.to_java_bytes)
      end
      @table.put(p)
    end

    #----------------------------------------------------------------------------------------------
    # Delete a cell
    def _delete_internal(row, column,
                timestamp = org.apache.hadoop.hbase.HConstants::LATEST_TIMESTAMP,
                args = {}, all_version = false)
      _deleteall_internal(row, column, timestamp, args, all_version)
    end

    #----------------------------------------------------------------------------------------------
    # Delete a row
    def _deleteall_internal(row, column = nil,
            timestamp = org.apache.hadoop.hbase.HConstants::LATEST_TIMESTAMP,
            args = {}, all_version = true)
      # delete operation doesn't need read permission. Retaining the read check for
      # meta table as a part of HBASE-5837.
      if is_meta_table?
        raise ArgumentError, "Row Not Found" if _get_internal(row).nil?
      end
      temptimestamp = timestamp
      if temptimestamp.kind_of?(Hash)
      	  timestamp = org.apache.hadoop.hbase.HConstants::LATEST_TIMESTAMP
      end
      d = org.apache.hadoop.hbase.client.Delete.new(row.to_s.to_java_bytes, timestamp)
      if temptimestamp.kind_of?(Hash)
      	temptimestamp.each do |k, v|
      	  if v.kind_of?(String)
      	  	set_cell_visibility(d, v) if v
      	  end
      	 end
      end
      if args.any?
         visibility = args[VISIBILITY]
         set_cell_visibility(d, visibility) if visibility
      end
      if column && all_version
        family, qualifier = parse_column_name(column)
        d.deleteColumns(family, qualifier, timestamp)
      elsif column && !all_version
        family, qualifier = parse_column_name(column)
        d.deleteColumn(family, qualifier, timestamp)
      end
      @table.delete(d)
    end

    #----------------------------------------------------------------------------------------------
    # Increment a counter atomically
    def _incr_internal(row, column, value = nil, args={})
      if value.kind_of?(Hash)
      	value = 1
      end
      value ||= 1
      incr = org.apache.hadoop.hbase.client.Increment.new(row.to_s.to_java_bytes)
      family, qualifier = parse_column_name(column)
      if qualifier.nil?
	  	raise ArgumentError, "Failed to provide both column family and column qualifier for incr"
      end
      if args.any?
      	attributes = args[ATTRIBUTES]
      	visibility = args[VISIBILITY]
        set_attributes(incr, attributes) if attributes
        set_cell_visibility(incr, visibility) if visibility
        ttl = args[TTL]
        set_op_ttl(incr, ttl) if ttl
      end
      incr.addColumn(family, qualifier, value)
      result = @table.increment(incr)
      return nil if result.isEmpty

      # Fetch cell value
      cell = result.listCells[0]
      org.apache.hadoop.hbase.util.Bytes::toLong(cell.getValue)
    end

    #----------------------------------------------------------------------------------------------
    # appends the value atomically
    def _append_internal(row, column, value, args={})
      append = org.apache.hadoop.hbase.client.Append.new(row.to_s.to_java_bytes)
      family, qualifier = parse_column_name(column)
      if qualifier.nil?
	  	raise ArgumentError, "Failed to provide both column family and column qualifier for append"
      end
      if args.any?
      	attributes = args[ATTRIBUTES]
      	visibility = args[VISIBILITY]
        set_attributes(append, attributes) if attributes
        set_cell_visibility(append, visibility) if visibility
        ttl = args[TTL]
        set_op_ttl(append, ttl) if ttl
      end
      append.add(family, qualifier, value.to_s.to_java_bytes)
      @table.append(append)
    end

    #----------------------------------------------------------------------------------------------
    # Count rows in a table
    def _count_internal(interval = 1000, caching_rows = 10)
      # We can safely set scanner caching with the first key only filter
      scan = org.apache.hadoop.hbase.client.Scan.new
      scan.setCacheBlocks(false)
      scan.setCaching(caching_rows)
      scan.setFilter(org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter.new)

      # Run the scanner
      scanner = @table.getScanner(scan)
      count = 0
      iter = scanner.iterator

      # Iterate results
      while iter.hasNext
        row = iter.next
        count += 1
        next unless (block_given? && count % interval == 0)
        # Allow command modules to visualize counting process
        yield(count,
              org.apache.hadoop.hbase.util.Bytes::toStringBinary(row.getRow))
      end

      scanner.close()
      # Return the counter
      return count
    end

    #----------------------------------------------------------------------------------------------
    # Get from table
    def _get_internal(row, *args)
      get = org.apache.hadoop.hbase.client.Get.new(row.to_s.to_java_bytes)
      maxlength = -1
      @converters.clear()

      # Normalize args
      args = args.first if args.first.kind_of?(Hash)
      if args.kind_of?(String) || args.kind_of?(Array)
        columns = [ args ].flatten.compact
        args = { COLUMNS => columns }
      end

      #
      # Parse arguments
      #
      unless args.kind_of?(Hash)
        raise ArgumentError, "Failed parse of of #{args.inspect}, #{args.class}"
      end

      # Get maxlength parameter if passed
      maxlength = args.delete(MAXLENGTH) if args[MAXLENGTH]
      filter = args.delete(FILTER) if args[FILTER]
      attributes = args[ATTRIBUTES]
      authorizations = args[AUTHORIZATIONS]
      consistency = args.delete(CONSISTENCY) if args[CONSISTENCY]
      replicaId = args.delete(REGION_REPLICA_ID) if args[REGION_REPLICA_ID]
      unless args.empty?
        columns = args[COLUMN] || args[COLUMNS]
        if args[VERSIONS]
          vers = args[VERSIONS]
        else
          vers = 1
        end
        if columns
          # Normalize types, convert string to an array of strings
          columns = [ columns ] if columns.is_a?(String)

          # At this point it is either an array or some unsupported stuff
          unless columns.kind_of?(Array)
            raise ArgumentError, "Failed parse column argument type #{args.inspect}, #{args.class}"
          end

          # Get each column name and add it to the filter
          columns.each do |column|
            family, qualifier = parse_column_name(column.to_s)
            if qualifier
              get.addColumn(family, qualifier)
            else
              get.addFamily(family)
            end
          end

          # Additional params
          get.setMaxVersions(vers)
          get.setTimeStamp(args[TIMESTAMP]) if args[TIMESTAMP]
          get.setTimeRange(args[TIMERANGE][0], args[TIMERANGE][1]) if args[TIMERANGE]
        else
          if attributes
          	 set_attributes(get, attributes)
          elsif authorizations
          	 set_authorizations(get, authorizations)
          else
          	# May have passed TIMESTAMP and row only; wants all columns from ts.
          	unless ts = args[TIMESTAMP] || tr = args[TIMERANGE]
            	raise ArgumentError, "Failed parse of #{args.inspect}, #{args.class}"
          	end
          end

          get.setMaxVersions(vers)
          # Set the timestamp/timerange
          get.setTimeStamp(ts.to_i) if args[TIMESTAMP]
          get.setTimeRange(args[TIMERANGE][0], args[TIMERANGE][1]) if args[TIMERANGE]
        end
        set_attributes(get, attributes) if attributes
        set_authorizations(get, authorizations) if authorizations
      end

      unless filter.class == String
        get.setFilter(filter)
      else
        get.setFilter(
          org.apache.hadoop.hbase.filter.ParseFilter.new.parseFilterString(filter.to_java_bytes))
      end

      get.setConsistency(org.apache.hadoop.hbase.client.Consistency.valueOf(consistency)) if consistency
      get.setReplicaId(replicaId) if replicaId

      # Call hbase for the results
      result = @table.get(get)
      return nil if result.isEmpty

      # Print out results.  Result can be Cell or RowResult.
      res = {}
      result.list.each do |kv|
        family = String.from_java_bytes(kv.getFamily)
        qualifier = org.apache.hadoop.hbase.util.Bytes::toStringBinary(kv.getQualifier)

        column = "#{family}:#{qualifier}"
        value = to_string(column, kv, maxlength)

        if block_given?
          yield(column, value)
        else
          res[column] = value
        end
      end

      # If block given, we've yielded all the results, otherwise just return them
      return ((block_given?) ? nil : res)
    end

    #----------------------------------------------------------------------------------------------
    # Fetches and decodes a counter value from hbase
    def _get_counter_internal(row, column)
      family, qualifier = parse_column_name(column.to_s)
      # Format get request
      get = org.apache.hadoop.hbase.client.Get.new(row.to_s.to_java_bytes)
      get.addColumn(family, qualifier)
      get.setMaxVersions(1)

      # Call hbase
      result = @table.get(get)
      return nil if result.isEmpty

      # Fetch cell value
      cell = result.list[0]
      org.apache.hadoop.hbase.util.Bytes::toLong(cell.getValue)
    end

    def _hash_to_scan(args)
      if args.any?
        enablemetrics = args["ALL_METRICS"].nil? ? false : args["ALL_METRICS"]
        enablemetrics = enablemetrics || !args["METRICS"].nil?
        filter = args["FILTER"]
        startrow = args["STARTROW"] || ''
        stoprow = args["STOPROW"]
        rowprefixfilter = args["ROWPREFIXFILTER"]
        timestamp = args["TIMESTAMP"]
        columns = args["COLUMNS"] || args["COLUMN"] || []
        # If CACHE_BLOCKS not set, then default 'true'.
        cache_blocks = args["CACHE_BLOCKS"].nil? ? true: args["CACHE_BLOCKS"]
        cache = args["CACHE"] || 0
        reversed = args["REVERSED"] || false
        versions = args["VERSIONS"] || 1
        timerange = args[TIMERANGE]
        raw = args["RAW"] || false
        attributes = args[ATTRIBUTES]
        authorizations = args[AUTHORIZATIONS]
        consistency = args[CONSISTENCY]
        # Normalize column names
        columns = [columns] if columns.class == String
        limit = args["LIMIT"] || -1
        unless columns.kind_of?(Array)
          raise ArgumentError.new("COLUMNS must be specified as a String or an Array")
        end

        scan = if stoprow
          org.apache.hadoop.hbase.client.Scan.new(startrow.to_java_bytes, stoprow.to_java_bytes)
        else
          org.apache.hadoop.hbase.client.Scan.new(startrow.to_java_bytes)
        end

        # This will overwrite any startrow/stoprow settings
        scan.setRowPrefixFilter(rowprefixfilter.to_java_bytes) if rowprefixfilter

        # Clear converters from last scan.
        @converters.clear()

        columns.each do |c|
          family, qualifier = parse_column_name(c.to_s)
          if qualifier
            scan.addColumn(family, qualifier)
          else
            scan.addFamily(family)
          end
        end

        unless filter.class == String
          scan.setFilter(filter)
        else
          scan.setFilter(
            org.apache.hadoop.hbase.filter.ParseFilter.new.parseFilterString(filter.to_java_bytes))
        end

        scan.setScanMetricsEnabled(enablemetrics) if enablemetrics
        scan.setTimeStamp(timestamp) if timestamp
        scan.setCacheBlocks(cache_blocks)
        scan.setReversed(reversed)
        scan.setCaching(cache) if cache > 0
        scan.setMaxVersions(versions) if versions > 1
        scan.setTimeRange(timerange[0], timerange[1]) if timerange
        scan.setRaw(raw)
        scan.setCaching(limit) if limit > 0
        set_attributes(scan, attributes) if attributes
        set_authorizations(scan, authorizations) if authorizations
        scan.setConsistency(org.apache.hadoop.hbase.client.Consistency.valueOf(consistency)) if consistency
      else
        scan = org.apache.hadoop.hbase.client.Scan.new
      end

      scan
    end

    def _get_scanner(args)
      @table.getScanner(_hash_to_scan(args))
    end

    #----------------------------------------------------------------------------------------------
    # Scans whole table or a range of keys and returns rows matching specific criteria
    def _scan_internal(args = {}, scan = nil)
      raise(ArgumentError, "Args should be a Hash") unless args.kind_of?(Hash)
      raise(ArgumentError, "Scan argument should be org.apache.hadoop.hbase.client.Scan") \
        unless scan == nil || scan.kind_of?(org.apache.hadoop.hbase.client.Scan)

      limit = args["LIMIT"] || -1
      maxlength = args.delete("MAXLENGTH") || -1
      count = 0
      res = {}

      # Start the scanner
      scan = scan == nil ? _hash_to_scan(args) : scan
      scanner = @table.getScanner(scan)
      iter = scanner.iterator

      # Iterate results
      while iter.hasNext
        row = iter.next
        key = org.apache.hadoop.hbase.util.Bytes::toStringBinary(row.getRow)

        row.list.each do |kv|
          family = String.from_java_bytes(kv.getFamily)
          qualifier = org.apache.hadoop.hbase.util.Bytes::toStringBinary(kv.getQualifier)

          column = "#{family}:#{qualifier}"
          cell = to_string(column, kv, maxlength)

          if block_given?
            yield(key, "column=#{column}, #{cell}")
          else
            res[key] ||= {}
            res[key][column] = cell
          end
        end

        # One more row processed
        count += 1
        if limit > 0 && count >= limit
          # If we reached the limit, exit before the next call to hasNext
          break
        end
      end

      scanner.close()
      return ((block_given?) ? count : res)
    end

     # Apply OperationAttributes to puts/scans/gets
    def set_attributes(oprattr, attributes)
      raise(ArgumentError, "Attributes must be a Hash type") unless attributes.kind_of?(Hash)
      for k,v in attributes
        v = v.to_s unless v.nil?
        oprattr.setAttribute(k.to_s, v.to_java_bytes)
      end
    end

    def set_cell_permissions(op, permissions)
      raise(ArgumentError, "Permissions must be a Hash type") unless permissions.kind_of?(Hash)
      map = java.util.HashMap.new
      permissions.each do |user,perms|
        map.put(user.to_s, org.apache.hadoop.hbase.security.access.Permission.new(
          perms.to_java_bytes))
      end
      op.setACL(map)
    end

    def set_cell_visibility(oprattr, visibility)
      oprattr.setCellVisibility(
        org.apache.hadoop.hbase.security.visibility.CellVisibility.new(
          visibility.to_s))
    end

    def set_authorizations(oprattr, authorizations)
      raise(ArgumentError, "Authorizations must be a Array type") unless authorizations.kind_of?(Array)
      auths = [ authorizations ].flatten.compact
      oprattr.setAuthorizations(
        org.apache.hadoop.hbase.security.visibility.Authorizations.new(
          auths.to_java(:string)))
    end

    def set_op_ttl(op, ttl)
      op.setTTL(ttl.to_java(:long))
    end

    #----------------------------
    # Add general administration utilities to the shell
    # each of the names below adds this method name to the table
    # by callling the corresponding method in the shell
    # Add single method utilities to the current class
    # Generally used for admin functions which just have one name and take the table name
    def self.add_admin_utils(*args)
      args.each do |method|
        define_method method do |*method_args|
          @shell.command(method, @name, *method_args)
        end
      end
    end

    #Add the following admin utilities to the table
    add_admin_utils :enable, :disable, :flush, :drop, :describe, :snapshot

    #----------------------------
    #give the general help for the table
    # or the named command
    def help (command = nil)
      #if there is a command, get the per-command help from the shell
      if command
        begin
          return @shell.help_command(command)
        rescue NoMethodError
          puts "Command \'#{command}\' does not exist. Please see general table help."
          return nil
        end
      end
      return @shell.help('table_help')
    end

    # Table to string
    def to_s
      cl = self.class()
      return "#{cl} - #{@name}"
    end

    # Standard ruby call to get the return value for an object
    # overriden here so we get sane semantics for printing a table on return
    def inspect
      to_s
    end

    #----------------------------------------------------------------------------------------
    # Helper methods

    # Returns a list of column names in the table
    def get_all_columns
      @table.table_descriptor.getFamilies.map do |family|
        "#{family.getNameAsString}:"
      end
    end

    # Checks if current table is one of the 'meta' tables
    def is_meta_table?
      org.apache.hadoop.hbase.TableName::META_TABLE_NAME.equals(@table.getName())
    end

    # Returns family and (when has it) qualifier for a column name
    def parse_column_name(column)
      split = org.apache.hadoop.hbase.KeyValue.parseColumn(column.to_java_bytes)
      set_converter(split) if split.length > 1
      return split[0], (split.length > 1) ? split[1] : nil
    end

    # Make a String of the passed kv
    # Intercept cells whose format we know such as the info:regioninfo in hbase:meta
    def to_string(column, kv, maxlength = -1)
      if is_meta_table?
        if column == 'info:regioninfo' or column == 'info:splitA' or column == 'info:splitB'
          hri = org.apache.hadoop.hbase.HRegionInfo.parseFromOrNull(kv.getValue)
          return "timestamp=%d, value=%s" % [kv.getTimestamp, hri.toString]
        end
        if column == 'info:serverstartcode'
          if kv.getValue.length > 0
            str_val = org.apache.hadoop.hbase.util.Bytes.toLong(kv.getValue)
          else
            str_val = org.apache.hadoop.hbase.util.Bytes.toStringBinary(kv.getValue)
          end
          return "timestamp=%d, value=%s" % [kv.getTimestamp, str_val]
        end
      end

      if kv.isDelete
        val = "timestamp=#{kv.getTimestamp}, type=#{org.apache.hadoop.hbase.KeyValue::Type::codeToType(kv.getType)}"
      else
        val = "timestamp=#{kv.getTimestamp}, value=#{convert(column, kv)}"
      end
      (maxlength != -1) ? val[0, maxlength] : val
    end

    def convert(column, kv)
      #use org.apache.hadoop.hbase.util.Bytes as the default class
      klazz_name = 'org.apache.hadoop.hbase.util.Bytes'
      #use org.apache.hadoop.hbase.util.Bytes::toStringBinary as the default convertor
      converter = 'toStringBinary'
      if @converters.has_key?(column)
        # lookup the CONVERTER for certain column - "cf:qualifier"
        matches = /c\((.+)\)\.(.+)/.match(@converters[column])
        if matches.nil?
          # cannot match the pattern of 'c(className).functionname'
          # use the default klazz_name
          converter = @converters[column]
        else
          klazz_name = matches[1]
          converter = matches[2]
        end
      end
      method = eval(klazz_name).method(converter)
      return method.call(kv.getValue) # apply the converter
    end

    # if the column spec contains CONVERTER information, to get rid of :CONVERTER info from column pair.
    # 1. return back normal column pair as usual, i.e., "cf:qualifier[:CONVERTER]" to "cf" and "qualifier" only
    # 2. register the CONVERTER information based on column spec - "cf:qualifier"
    def set_converter(column)
      family = String.from_java_bytes(column[0])
      parts = org.apache.hadoop.hbase.KeyValue.parseColumn(column[1])
      if parts.length > 1
        @converters["#{family}:#{String.from_java_bytes(parts[0])}"] = String.from_java_bytes(parts[1])
        column[1] = parts[0]
      end
    end

    #----------------------------------------------------------------------------------------------
    # Get the split points for the table
    def _get_splits_internal()
      locator = @table.getRegionLocator()
      splits = locator.getAllRegionLocations().
          map{|i| Bytes.toStringBinary(i.getRegionInfo().getStartKey)}.delete_if{|k| k == ""}
      locator.close()
      puts("Total number of splits = %s" % [splits.size + 1])
      return splits
    end
  end
end