# Source: org.apache.tika.parser.recognition.tf.inceptionapi.py
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Image classification with Inception.
This script exposes the tensorflow's inception classification service over REST API.
For more details, visit:
https://tensorflow.org/tutorials/image_recognition/
Requirements :
Flask
tensorflow
numpy
requests
pillow
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tempfile
import json
import logging
import requests
from flask import Flask, request, Response, jsonify
from io import BytesIO
from logging.handlers import RotatingFileHandler
from PIL import Image
from time import time
import tensorflow as tf
from inception_v4 import default_image_size, inception_v4_arg_scope, inception_v4
try:
    # This import is placed inside here to ensure that video_util and OpenCV is not required for image recognition APIs
    from video_util import get_center_frame, get_frames_interval, get_n_frames
except ImportError:
    # FIX: a bare `except:` also hid unrelated errors (e.g. SyntaxError in
    # video_util); only a missing optional dependency should be tolerated.
    print("Can't import video libraries, No video functionality is available")
# NOTE(review): json.encoder.FLOAT_REPR is ignored by Python 3's C-accelerated
# json encoder, so this 2-decimal rounding likely only takes effect on
# Python 2 — confirm on the target interpreter.
json.encoder.FLOAT_REPR = lambda o: format(o, '.2f')  # JSON serialization of floats
# TF-Slim and tf.app.flags are TensorFlow 1.x-only APIs.
slim = tf.contrib.slim
FLAGS = tf.app.flags.FLAGS
# Command-line flags: model checkpoint directory, server port, log file name.
tf.app.flags.DEFINE_string('model_dir',
                           '/usr/share/apache-tika/models/dl/image-video/recognition/',
                           """Path to inception_v4.ckpt & meta files""")
tf.app.flags.DEFINE_integer('port',
                            '8764',
                            """Server PORT, default:8764""")
tf.app.flags.DEFINE_string('log',
                           'inception.log',
                           """Log file name, default: inception.log""")
def preprocess_image(image, height, width, central_fraction=0.875, scope=None):
    """Build the graph ops that prepare one image tensor for evaluation.

    The image is converted to float32 in [0, 1] if needed, center-cropped to
    `central_fraction` of its area, bilinearly resized to (height, width),
    and finally rescaled from [0, 1] to [-1, 1].

    Args:
        image: 3-D image Tensor. Non-float32 dtypes are converted via
            `tf.image.convert_image_dtype` (assumed range [0, MAX]).
        height: target height (resize skipped when falsy).
        width: target width (resize skipped when falsy).
        central_fraction: optional float, fraction of the image to keep.
        scope: optional name_scope for the added ops.

    Returns:
        3-D float32 Tensor of the prepared image.
    """
    with tf.name_scope(scope, 'eval_image', [image, height, width]):
        if image.dtype != tf.float32:
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        # Keep only the central region (87.5% of the area by default).
        if central_fraction:
            image = tf.image.central_crop(image, central_fraction=central_fraction)
        if height and width:
            # resize_bilinear needs a batch dimension; add it, resize, drop it.
            batched = tf.expand_dims(image, 0)
            resized = tf.image.resize_bilinear(batched, [height, width],
                                               align_corners=False)
            image = tf.squeeze(resized, [0])
        # Map [0, 1] -> [-1, 1], the range Inception expects.
        image = tf.multiply(tf.subtract(image, 0.5), 2.0)
        return image
def create_readable_names_for_imagenet_labels():
    """Create a dict mapping label id to human readable string.

    Returns:
        labels_to_names: dictionary where keys are integers from 0 to 1000
        and values are human-readable names.

    We read a synset file, which contains the list of valid synset labels used
    by the ILSVRC competition (one synset per line, e.g. n01440764), and a
    metadata file mapping every ImageNet synset to a human-readable name in
    tsv format (e.g. "n02119247\tblack fox"). Each competition synset is
    assigned an integer label starting from 1, since 0 is reserved for the
    background class.

    Code is based on
    https://github.com/tensorflow/models/blob/master/inception/inception/data/build_imagenet_data.py
    """
    dest_directory = FLAGS.model_dir
    # FIX: the original leaked both file handles; close them deterministically.
    with open(os.path.join(dest_directory, 'imagenet_lsvrc_2015_synsets.txt')) as f:
        synset_list = [s.strip() for s in f.readlines()]
    assert len(synset_list) == 1000
    with open(os.path.join(dest_directory, 'imagenet_metadata.txt')) as f:
        synset_to_human_list = f.readlines()
    assert len(synset_to_human_list) == 21842
    synset_to_human = {}
    for s in synset_to_human_list:
        parts = s.strip().split('\t')
        assert len(parts) == 2
        synset_to_human[parts[0]] = parts[1]
    # Label 0 is the background class; competition synsets start at 1.
    labels_to_names = {0: 'background'}
    for label_index, synset in enumerate(synset_list, start=1):
        labels_to_names[label_index] = synset_to_human[synset]
    return labels_to_names
def get_remote_file(url, success=200, timeout=10):
    """Fetch the content of an HTTP URL.

    Args:
        url: the HTTP URL to GET.
        success: status code treated as success (default 200).
        timeout: request timeout in seconds.

    Returns:
        (content_type, body_bytes) on success, (None, None) on any failure
        (non-success status, connection error, or timeout).
    """
    try:
        app.logger.info("GET: %s" % url)
        res = requests.get(url, stream=True, timeout=timeout)
        if res.status_code == success:
            return res.headers.get('Content-Type', 'application/octet-stream'), res.raw.data
    except requests.RequestException as e:
        # FIX: the bare `except: pass` also hid programming errors; catch only
        # network-level failures and record them for diagnosis.
        app.logger.error("Failed to GET %s: %s" % (url, e))
    return None, None
def current_time():
    """Return the current wall-clock time in whole milliseconds."""
    millis = time() * 1000
    return int(millis)
class Classifier(Flask):
    """Classifier Service class.

    A Flask application that also owns a long-lived TensorFlow 1.x session
    holding an Inception V4 graph restored from FLAGS.model_dir.
    """

    def __init__(self, name):
        super(Classifier, self).__init__(name)
        # Rotating log file: 100 MB per file, up to 20 backups.
        file_handler = RotatingFileHandler(FLAGS.log, maxBytes=1024 * 1024 * 100, backupCount=20)
        file_handler.setLevel(logging.INFO)
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)
        # label id -> human-readable ImageNet name (0 is 'background').
        self.names = create_readable_names_for_imagenet_labels()
        self.image_size = default_image_size
        # Graph input: raw JPEG bytes fed at classification time.
        self.image_str_placeholder = tf.placeholder(tf.string)
        image = tf.image.decode_jpeg(self.image_str_placeholder, channels=3)
        processed_image = preprocess_image(image, self.image_size, self.image_size)
        # Add a batch dimension of 1; the model expects batched input.
        processed_images = tf.expand_dims(processed_image, 0)
        # create the model, use the default arg scope to configure the batch norm parameters.
        with slim.arg_scope(inception_v4_arg_scope()):
            logits, _ = inception_v4(processed_images, num_classes=1001, is_training=False)
        self.probabilities = tf.nn.softmax(logits)
        dest_directory = FLAGS.model_dir
        init_fn = slim.assign_from_checkpoint_fn(
            os.path.join(dest_directory, 'inception_v4.ckpt'),
            slim.get_model_variables('InceptionV4'))
        # One session restored once at startup and reused for every request.
        self.sess = tf.Session()
        init_fn(self.sess)

    def classify(self, image_string, topn, min_confidence):
        """Classify JPEG bytes; return [(label_id, label_name, score), ...]
        sorted by descending score. topn=None keeps all classes;
        min_confidence=None disables the score cutoff."""
        eval_probabilities = self.sess.run(self.probabilities, feed_dict={self.image_str_placeholder: image_string})
        eval_probabilities = eval_probabilities[0, 0:]
        # Class indices ordered by descending probability.
        sorted_inds = [i[0] for i in sorted(enumerate(-eval_probabilities), key=lambda x: x[1])]
        if topn is None:
            topn = len(sorted_inds)
        res = []
        for i in range(topn):
            index = sorted_inds[i]
            score = float(eval_probabilities[index])
            if min_confidence is None:
                res.append((index, self.names[index], score))
            else:
                if score >= min_confidence:
                    res.append((index, self.names[index], score))
                else:
                    # the scores are in sorted order, so we can break the loop whenever we get a low score object
                    break
        return res
# Module-level singleton; constructing it also builds the TF graph and
# restores the checkpoint (see Classifier.__init__).
app = Classifier(__name__)
@app.route("/")
def index():
    """The index page which provide information about other API end points"""
    # Plain-text usage page; the string is returned verbatim to the client.
    return """
Inception REST API
The following API end points are valid
Inception V4
-
/inception/v4/ping
-
Description : checks availability of the service. returns "pong" with status 200 when it is available
-
/inception/v4/classify/image
-
Description This is a classifier service that can classify images
Query Params :
topn
: type = int : top classes to get; default : 5
min_confidence
: type = float : minimum confidence that a label should have to exist in topn; default : 0.015
human
: type = boolean : human readable class names; default : true
How to supply Image Content
With HTTP GET :
Include a query parameter url
which is an http url of JPEG image
Example: curl "localhost:8764/inception/v4/classify/image?url=http://xyz.com/example.jpg"
With HTTP POST :
POST JPEG image content as binary data in request body.
Example: curl -X POST "localhost:8764/inception/v4/classify/image?topn=5&min_confidence=0.015&human=false" --data-binary @example.jpg
-
/inception/v4/classify/video
-
Description This is a classifier service that can classify videos
Query Params :
topn
: type = int : top classes to get; default : 5
min_confidence
: type = float : minimum confidence that a label should have to exist in topn; default : 0.015
human
: type = boolean : human readable class names; default : true
mode
: options = {"center", "interval", "fixed"}
: Modes of frame extraction; default : center
"center"
- Just one frame in center.
"interval"
- Extracts frames after fixed interval.
"fixed"
- Extract fixed number of frames.
frame-interval
: type = int : Interval for frame extraction to be used with INTERVAL mode. If frame_interval=10 then every 10th frame will be extracted; default : 10
num-frame
: type = int : Number of frames to be extracted from video while using FIXED model. If num_frame=10 then 10 frames equally distant from each other will be extracted; default : 10
How to supply Video Content
With HTTP GET :
Include a query parameter url
which is path on file system
Example: curl "localhost:8764/inception/v4/classify/video?url=filesystem/path/to/video"
With HTTP POST :
POST video content as binary data in request body. If video can be decoded by OpenCV it should be fine. It's tested on mp4 and avi on mac
Include a query parameter ext
this extension is needed to tell OpenCV which decoder to use, default is ".mp4"
Example: curl -X POST "localhost:8764/inception/v4/classify/video?topn=5&min_confidence=0.015&human=false" --data-binary @example.mp4
"""
@app.route("/inception/v4/ping", methods=["GET"])
def ping_pong():
    """Health-check endpoint: a 200 response with body "pong" means healthy."""
    reply = "pong"
    return reply
@app.route("/inception/v4/classify/image", methods=["GET", "POST"])
def classify_image():
    """Classify an image supplied as a POST body or via the `url` query param.

    Query params: topn (int, default 5), min_confidence (float, default
    0.015), human (bool, default true -> include class names in response).
    Returns a JSON document with class ids, confidences and timing info, or
    a 400 response on fetch/classification failure.
    """
    image_format = "not jpeg"
    st = current_time()
    topn = int(request.args.get("topn", "5"))
    min_confidence = float(request.args.get("min_confidence", "0.015"))
    human = request.args.get("human", "true").lower() in ("true", "1", "yes")
    if request.method == 'POST':
        image_data = request.get_data()
    else:
        url = request.args.get("url")
        c_type, image_data = get_remote_file(url)
        if not image_data:
            # FIX: jsonify() returns a Response object and is not valid as the
            # `response` body of another Response; serialize explicitly.
            return Response(status=400,
                            response=json.dumps({"error": "Could not HTTP GET %s" % url}),
                            mimetype="application/json")
        # use c_type to find whether image_format is jpeg or not
        if 'image/jpeg' in c_type:
            image_format = "jpeg"
    # if jpeg, feed the bytes straight to the graph's decode_jpeg input
    if image_format == "jpeg":
        jpg_image = image_data
    else:
        # open the image from raw bytes
        image = Image.open(BytesIO(image_data))
        # convert to RGB first: saving non-RGB modes (RGBA/P) as JPEG fails
        rgb_image = image.convert("RGB")
        # re-encode the RGB image as JPEG
        image_bytes = BytesIO()
        rgb_image.save(image_bytes, format="jpeg", quality=95)
        jpg_image = image_bytes.getvalue()
        image_bytes.close()
    read_time = current_time() - st
    st = current_time()  # reset start time
    try:
        classes = app.classify(image_string=jpg_image, topn=topn, min_confidence=min_confidence)
    except Exception as e:
        app.logger.error(e)
        return Response(status=400, response=str(e))
    # FIX: zip(*[]) raises when min_confidence filters out every class;
    # return empty tuples instead of a 500. (Debug print also removed.)
    if classes:
        classids, classnames, confidence = zip(*classes)
    else:
        classids, classnames, confidence = (), (), ()
    classifier_time = current_time() - st
    app.logger.info("Classifier time : %d" % classifier_time)
    res = {
        'classids': classids,
        'confidence': confidence,
        'time': {
            'read': read_time,
            'classification': classifier_time,
            'units': 'ms'
        }
    }
    if human:
        res['classnames'] = classnames
    return Response(response=json.dumps(res), status=200, mimetype="application/json")
@app.route("/inception/v4/classify/video", methods=["GET", "POST"])
def classify_video():
    """Classify a video by averaging per-frame Inception V4 scores.

    Request args -
        url - PATH of file (GET) or POST the video bytes in the request body
        topn - number of top scoring labels
        min_confidence - minimum average confidence for a label to be kept
        human - human readable class names or not
        mode - frame extraction mode: {"center", "interval", "fixed"}
            "center" - just one frame in the center
            "interval" - extracts every frame-interval'th frame
            "fixed" - extracts num-frame equally distant frames
        frame-interval - interval for "interval" mode (default 10)
        num-frame - frame count for "fixed" mode (default 10)
        ext - extension hint so OpenCV picks the right decoder for POSTed
              binary data, e.g. ".mp4"
    """
    st = current_time()
    topn = int(request.args.get("topn", "5"))
    min_confidence = float(request.args.get("min_confidence", "0.015"))
    human = request.args.get("human", "true").lower() in ("true", "1", "yes")
    allowed_modes = ("center", "interval", "fixed")
    mode = request.args.get("mode", "center").lower()
    if mode not in allowed_modes:
        # FIX: the original referenced an undefined name ALLOWED_MODE (a
        # guaranteed NameError on this path) and fed a jsonify() Response
        # object into Response(); use a local tuple and json.dumps instead.
        return Response(status=400,
                        response=json.dumps({"error": "not a valid mode. Available mode %s" % str(allowed_modes)}),
                        mimetype="application/json")
    frame_interval = int(request.args.get("frame-interval", "10"))
    num_frame = int(request.args.get("num-frame", "10"))
    if request.method == 'POST':
        video_data = request.get_data()
        ext = request.args.get("ext", ".mp4").lower()
        # NOTE(review): closing the inner .file leaves the on-disk file in
        # place until `temp_file` is garbage-collected, which is what the
        # frame extractors below rely on — confirm lifecycle before changing.
        temp_file = tempfile.NamedTemporaryFile(suffix=ext)
        temp_file.file.write(video_data)
        temp_file.file.close()
        url = temp_file.name
    else:
        url = request.args.get("url")
    read_time = current_time() - st
    st = current_time()  # reset start time
    if mode == "center":
        image_data_arr = [get_center_frame(url)]
    elif mode == "interval":
        image_data_arr = get_frames_interval(url, frame_interval)
    else:
        image_data_arr = get_n_frames(url, num_frame)
    classes = []
    for image_data in image_data_arr:
        try:
            _classes = app.classify(image_data, topn=None, min_confidence=None)
        except Exception as e:
            app.logger.error(e)
            return Response(status=400, response=str(e))
        # Sort by class id so scores can be summed position-wise across frames.
        _classes.sort()
        if len(classes) == 0:
            classes = _classes
        else:
            for idx, _c in enumerate(_classes):
                c = list(classes[idx])
                c[2] += _c[2]
                classes[idx] = tuple(c)
    top_classes = []
    for c in classes:
        c = list(c)
        # average the summed confidence over the number of frames
        avg_score = c[2] / len(image_data_arr)
        c[2] = avg_score
        if avg_score >= min_confidence:
            top_classes.append(tuple(c))
    top_classes = sorted(top_classes, key=lambda tup: tup[2])[-topn:][::-1]
    # FIX: guard against zip(*[]) when nothing passes min_confidence.
    if top_classes:
        classids, classnames, confidence = zip(*top_classes)
    else:
        classids, classnames, confidence = (), (), ()
    classifier_time = current_time() - st
    app.logger.info("Classifier time : %d" % classifier_time)
    res = {
        'classids': classids,
        'confidence': confidence,
        'time': {
            'read': read_time,
            'classification': classifier_time,
            'units': 'ms'
        }
    }
    if human:
        res['classnames'] = classnames
    return Response(response=json.dumps(res), status=200, mimetype="application/json")
def main(_):
    """Entry point invoked by tf.app.run(): start the Flask server.

    Listens on all interfaces at FLAGS.port (default 8764).
    """
    if not app.debug:
        print("Serving on port %d" % FLAGS.port)
    app.run(host="0.0.0.0", port=FLAGS.port)
if __name__ == '__main__':
    # tf.app.run() parses the tf.app.flags defined above, then calls main().
    tf.app.run()
# (End of file; repository-page footer removed.)