All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencastproject.textanalyzer.impl.TextAnalyzerServiceImpl Maven / Gradle / Ivy

There is a newer version: 16.7
Show newest version
/*
 * Licensed to The Apereo Foundation under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 *
 * The Apereo Foundation licenses this file to you under the Educational
 * Community License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License
 * at:
 *
 *   http://opensource.org/licenses/ecl2.txt
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 */

package org.opencastproject.textanalyzer.impl;

import org.opencastproject.dictionary.api.DictionaryService;
import org.opencastproject.job.api.AbstractJobProducer;
import org.opencastproject.job.api.Job;
import org.opencastproject.mediapackage.Attachment;
import org.opencastproject.mediapackage.Catalog;
import org.opencastproject.mediapackage.MediaPackageElementBuilderFactory;
import org.opencastproject.mediapackage.MediaPackageElementParser;
import org.opencastproject.mediapackage.MediaPackageElements;
import org.opencastproject.mediapackage.MediaPackageException;
import org.opencastproject.metadata.mpeg7.MediaTime;
import org.opencastproject.metadata.mpeg7.MediaTimeImpl;
import org.opencastproject.metadata.mpeg7.Mpeg7CatalogImpl;
import org.opencastproject.metadata.mpeg7.Mpeg7CatalogService;
import org.opencastproject.metadata.mpeg7.SpatioTemporalDecomposition;
import org.opencastproject.metadata.mpeg7.TemporalDecomposition;
import org.opencastproject.metadata.mpeg7.Textual;
import org.opencastproject.metadata.mpeg7.Video;
import org.opencastproject.metadata.mpeg7.VideoSegment;
import org.opencastproject.metadata.mpeg7.VideoText;
import org.opencastproject.metadata.mpeg7.VideoTextImpl;
import org.opencastproject.security.api.OrganizationDirectoryService;
import org.opencastproject.security.api.SecurityService;
import org.opencastproject.security.api.UserDirectoryService;
import org.opencastproject.serviceregistry.api.ServiceRegistry;
import org.opencastproject.serviceregistry.api.ServiceRegistryException;
import org.opencastproject.textanalyzer.api.TextAnalyzerException;
import org.opencastproject.textanalyzer.api.TextAnalyzerService;
import org.opencastproject.textextractor.api.TextExtractor;
import org.opencastproject.textextractor.api.TextExtractorException;
import org.opencastproject.util.LoadUtil;
import org.opencastproject.util.NotFoundException;
import org.opencastproject.util.ReadinessIndicator;
import org.opencastproject.workspace.api.Workspace;

import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.cm.ManagedService;
import org.osgi.service.component.ComponentContext;
import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Reference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Dictionary;
import java.util.List;

/**
 * Media analysis service that takes an image and returns text as extracted from that image.
 */
@Component(
    immediate = true,
    service = { TextAnalyzerService.class,ManagedService.class },
    property = {
        "service.description=Text Analysis Service",
        "service.pid=org.opencastproject.textanalyzer.impl.TextAnalyzerServiceImpl"
    }
)
public class TextAnalyzerServiceImpl extends AbstractJobProducer implements TextAnalyzerService, ManagedService {

  /** The logging facility */
  private static final Logger logger = LoggerFactory.getLogger(TextAnalyzerServiceImpl.class);

  /** List of operations this service accepts as job operations (see {@code process(Job)}) */
  private enum Operation {
    Extract
  };

  /** Identifier of the workspace collection that receives the resulting mpeg-7 catalogs */
  public static final String COLLECTION_ID = "ocrtext";

  /** The approximate load placed on the system by creating a text analysis job */
  public static final float DEFAULT_ANALYSIS_JOB_LOAD = 0.2f;

  /** The key to look for in the service configuration file to override the {@link #DEFAULT_ANALYSIS_JOB_LOAD} */
  public static final String ANALYSIS_JOB_LOAD_KEY = "job.load.analysis";

  /**
   * The load value passed along when a text analysis job is created. Starts out as
   * {@link #DEFAULT_ANALYSIS_JOB_LOAD} and is replaced whenever the configuration is updated.
   */
  private float analysisJobLoad = DEFAULT_ANALYSIS_JOB_LOAD;

  /** The text extraction implementation */
  private TextExtractor textExtractor = null;

  /** The service registry used to create text analysis jobs */
  private ServiceRegistry serviceRegistry = null;

  /** The workspace to use when retrieving image files and storing the resulting catalogs */
  private Workspace workspace = null;

  /** The mpeg-7 service used to serialize the resulting catalog */
  protected Mpeg7CatalogService mpeg7CatalogService;

  /** The dictionary service used to clean up the extracted text lines */
  protected DictionaryService dictionaryService;

  /** The security service */
  protected SecurityService securityService = null;

  /** The user directory service */
  protected UserDirectoryService userDirectoryService = null;

  /** The organization directory service */
  protected OrganizationDirectoryService organizationDirectoryService = null;

  /**
   * Creates a new instance of the text analyzer service, registering it with the job producer base class under
   * {@code JOB_TYPE}.
   */
  public TextAnalyzerServiceImpl() {
    super(JOB_TYPE);
  }

  /**
   * OSGi callback on component activation. Logs the activation and then delegates to the activation logic of the
   * parent job producer.
   *
   * @param cc
   *          the component context
   */
  @Override
  @Activate
  public void activate(ComponentContext cc) {
    logger.info("Activating Text analyser service");
    super.activate(cc);
  }

  /**
   * {@inheritDoc}
   * <p>
   * Serializes the image element to XML and registers a new {@code Extract} job for it with the service registry,
   * using the currently configured analysis job load.
   *
   * @see org.opencastproject.textanalyzer.api.TextAnalyzerService#extract(org.opencastproject.mediapackage.Attachment)
   */
  @Override
  public Job extract(Attachment image) throws TextAnalyzerException, MediaPackageException {
    // The serialized element is the single argument of the job
    final String serializedImage = MediaPackageElementParser.getAsXml(image);
    final String operationName = Operation.Extract.toString();
    try {
      return serviceRegistry.createJob(JOB_TYPE, operationName, Arrays.asList(serializedImage), analysisJobLoad);
    } catch (ServiceRegistryException registryFailure) {
      throw new TextAnalyzerException("Unable to create job", registryFailure);
    }
  }

  /**
   * Runs text extraction on the image and stores the result as a serialized mpeg-7 catalog in the workspace
   * collection {@link #COLLECTION_ID}, named after the job identifier.
   * <p>
   * Regardless of the outcome, the method finally attempts to delete the image from the workspace; a failure to do
   * so is only logged.
   *
   * @param job
   *          the job this extraction runs for; its identifier is used to name the catalog file
   * @param image
   *          the element to analyze
   * @return a catalog element of flavor {@link MediaPackageElements#TEXTS} pointing to the stored mpeg-7 catalog
   * @throws TextAnalyzerException
   *           if the image cannot be read from the workspace, the analysis fails, or the catalog cannot be
   *           serialized or stored; any other exception is wrapped into this type as well
   * @throws MediaPackageException
   *           kept in the signature for compatibility with existing callers
   */
  private Catalog extract(Job job, Attachment image) throws TextAnalyzerException, MediaPackageException {

    final URI imageUrl = image.getURI();

    try {
      Mpeg7CatalogImpl mpeg7 = Mpeg7CatalogImpl.newInstance();

      logger.info("Starting text extraction from {}", imageUrl);
      final File imageFile;
      try {
        imageFile = workspace.get(imageUrl);
      } catch (NotFoundException e) {
        throw new TextAnalyzerException("Image " + imageUrl + " not found in workspace", e);
      } catch (IOException e) {
        throw new TextAnalyzerException("Unable to access " + imageUrl + " in workspace", e);
      }
      VideoText[] videoTexts = analyze(imageFile, image.getIdentifier());

      // Create a temporal decomposition
      MediaTime mediaTime = new MediaTimeImpl(0, 0);
      Video avContent = mpeg7.addVideoContent(image.getIdentifier(), mediaTime, null);
      // The decomposition is typed so that createSegment() yields a VideoSegment
      @SuppressWarnings("unchecked")
      TemporalDecomposition<VideoSegment> temporalDecomposition = (TemporalDecomposition<VideoSegment>) avContent
              .getTemporalDecomposition();

      // Add a segment
      VideoSegment videoSegment = temporalDecomposition.createSegment("segment-0");
      videoSegment.setMediaTime(mediaTime);

      // Add the video text to the spatio-temporal decomposition of the segment
      SpatioTemporalDecomposition spatioTemporalDecomposition = videoSegment.createSpatioTemporalDecomposition(true,
              false);
      for (VideoText videoText : videoTexts) {
        spatioTemporalDecomposition.addVideoText(videoText);
      }

      logger.info("Text extraction of {} finished, {} lines found", imageUrl, videoTexts.length);

      final InputStream in;
      try {
        in = mpeg7CatalogService.serialize(mpeg7);
      } catch (IOException e) {
        throw new TextAnalyzerException("Error serializing mpeg7", e);
      }
      URI uri;
      // Make sure the serialized catalog stream is closed once it has been handed to the workspace
      try (InputStream catalogStream = in) {
        uri = workspace.putInCollection(COLLECTION_ID, job.getId() + ".xml", catalogStream);
      } catch (IOException e) {
        throw new TextAnalyzerException("Unable to put mpeg7 into the workspace", e);
      }
      Catalog catalog = (Catalog) MediaPackageElementBuilderFactory.newInstance().newElementBuilder()
              .newElement(Catalog.TYPE, MediaPackageElements.TEXTS);
      catalog.setURI(uri);

      logger.debug("Created MPEG7 catalog for {}", imageUrl);

      return catalog;
    } catch (Exception e) {
      logger.warn("Error extracting text from {}", imageUrl, e);
      if (e instanceof TextAnalyzerException) {
        throw (TextAnalyzerException) e;
      } else {
        throw new TextAnalyzerException(e);
      }
    } finally {
      try {
        workspace.delete(imageUrl);
      } catch (Exception e) {
        // Best effort only: the trailing throwable is logged with its stack trace, so no placeholder is needed for it
        logger.warn("Unable to delete temporary text analysis image {}", imageUrl, e);
      }
    }
  }

  /**
   * {@inheritDoc}
   * <p>
   * Dispatches the job to the matching {@link Operation}. For {@code Extract}, the first job argument is parsed as
   * the image attachment, text extraction is run on it, and the resulting catalog is returned serialized as XML.
   * Every failure is reported as a {@link ServiceRegistryException} that names the requested operation.
   *
   * @see org.opencastproject.job.api.AbstractJobProducer#process(org.opencastproject.job.api.Job)
   */
  @Override
  protected String process(Job job) throws Exception {
    final String operation = job.getOperation();
    final List<String> arguments = job.getArguments();
    try {
      final Operation op = Operation.valueOf(operation);
      switch (op) {
        case Extract:
          Attachment element = (Attachment) MediaPackageElementParser.getFromXml(arguments.get(0));
          Catalog catalog = extract(job, element);
          return MediaPackageElementParser.getAsXml(catalog);
        default:
          throw new IllegalStateException("Don't know how to handle operation '" + operation + "'");
      }
    } catch (IllegalArgumentException e) {
      // Report the requested operation name: the enum value is not available when valueOf() rejects the name
      throw new ServiceRegistryException("This service can't handle operations of type '" + operation + "'", e);
    } catch (IndexOutOfBoundsException e) {
      throw new ServiceRegistryException("This argument list for operation '" + operation
              + "' does not meet expectations", e);
    } catch (Exception e) {
      throw new ServiceRegistryException("Error handling operation '" + operation + "'", e);
    }
  }

  /**
   * Returns the video text elements for the given image.
   * <p>
   * Every line returned by the text extractor is passed through the dictionary service; lines for which the
   * dictionary service returns {@code null} are dropped. The identifiers of the video texts are built from
   * {@code id} and the one-based position of the line in the extractor output, so dropped lines leave gaps in the
   * numbering.
   *
   * @param imageFile
   *          the image
   * @param id
   *          the video text id prefix
   * @return the video texts found on the image, possibly empty
   * @throws TextAnalyzerException
   *           if the text extractor fails on the image
   */
  protected VideoText[] analyze(File imageFile, String id) throws TextAnalyzerException {

    /* Call the text extractor implementation to extract the text from the
     * provided image file */
    final List<String> extractedText;
    try {
      extractedText = textExtractor.extract(imageFile);
    } catch (IOException | TextExtractorException e) {
      logger.warn("Error extracting text from {}", imageFile, e);
      throw new TextAnalyzerException(e);
    }

    /* Turn every extracted line that survives the dictionary clean-up into a video text */
    final List<VideoText> videoTexts = new ArrayList<>();
    int i = 1;
    for (String line : extractedText) {
      // The counter advances for every line, including the ones that are dropped below
      VideoText videoText = new VideoTextImpl(id + "-" + i++);
      Textual text = dictionaryService.cleanUpText(line);
      if (text != null) {
        videoText.setText(text);
        videoTexts.add(videoText);
      }
    }

    return videoTexts.toArray(new VideoText[0]);
  }

  /**
   * Sets the service registry, which is used to create text analysis jobs.
   *
   * @param serviceRegistry
   *          the service registry
   */
  @Reference
  protected void setServiceRegistry(ServiceRegistry serviceRegistry) {
    this.serviceRegistry = serviceRegistry;
  }

  /**
   * {@inheritDoc}
   * <p>
   * Hands out the service registry that was set on this service.
   *
   * @see org.opencastproject.job.api.AbstractJobProducer#getServiceRegistry()
   */
  @Override
  protected ServiceRegistry getServiceRegistry() {
    return this.serviceRegistry;
  }

  /**
   * Sets the text extractor that is called to extract the text lines from an image file.
   *
   * @param textExtractor
   *          a text extractor implementation
   */
  @Reference
  protected void setTextExtractor(TextExtractor textExtractor) {
    this.textExtractor = textExtractor;
  }

  /**
   * Sets the workspace, which is used to retrieve the image files and to store the resulting catalogs.
   *
   * @param workspace
   *          an instance of the workspace
   */
  @Reference
  protected void setWorkspace(Workspace workspace) {
    this.workspace = workspace;
  }

  /**
   * Sets the mpeg7CatalogService, which is used to serialize the resulting mpeg-7 catalog.
   *
   * @param mpeg7CatalogService
   *          an instance of the mpeg7 catalog service
   */
  @Reference(name = "mpeg7service")
  protected void setMpeg7CatalogService(Mpeg7CatalogService mpeg7CatalogService) {
    this.mpeg7CatalogService = mpeg7CatalogService;
  }

  /**
   * Sets the dictionary service, which is used to clean up the extracted text lines.
   *
   * @param dictionaryService
   *          an instance of the dictionary service
   */
  @Reference
  protected void setDictionaryService(DictionaryService dictionaryService) {
    this.dictionaryService = dictionaryService;
  }

  /**
   * Callback for setting the security service, which is handed to the parent job producer through
   * {@link #getSecurityService()}.
   *
   * @param securityService
   *          the securityService to set
   */
  @Reference
  public void setSecurityService(SecurityService securityService) {
    this.securityService = securityService;
  }

  /**
   * Callback for setting the user directory service, which is handed to the parent job producer through
   * {@link #getUserDirectoryService()}.
   *
   * @param userDirectoryService
   *          the userDirectoryService to set
   */
  @Reference
  public void setUserDirectoryService(UserDirectoryService userDirectoryService) {
    this.userDirectoryService = userDirectoryService;
  }

  /**
   * Sets a reference to the organization directory service, which is handed to the parent job producer through
   * {@link #getOrganizationDirectoryService()}.
   *
   * @param organizationDirectory
   *          the organization directory
   */
  @Reference
  public void setOrganizationDirectoryService(OrganizationDirectoryService organizationDirectory) {
    this.organizationDirectoryService = organizationDirectory;
  }

  /**
   * {@inheritDoc}
   * <p>
   * Hands out the security service that was set on this service.
   *
   * @see org.opencastproject.job.api.AbstractJobProducer#getSecurityService()
   */
  @Override
  protected SecurityService getSecurityService() {
    return this.securityService;
  }

  /**
   * {@inheritDoc}
   * <p>
   * Hands out the user directory service that was set on this service.
   *
   * @see org.opencastproject.job.api.AbstractJobProducer#getUserDirectoryService()
   */
  @Override
  protected UserDirectoryService getUserDirectoryService() {
    return this.userDirectoryService;
  }

  /**
   * {@inheritDoc}
   * <p>
   * Hands out the organization directory service that was set on this service.
   *
   * @see org.opencastproject.job.api.AbstractJobProducer#getOrganizationDirectoryService()
   */
  @Override
  protected OrganizationDirectoryService getOrganizationDirectoryService() {
    return this.organizationDirectoryService;
  }

  /**
   * Configuration callback: re-reads the load of text analysis jobs from the given properties, looking it up under
   * {@link #ANALYSIS_JOB_LOAD_KEY} with {@link #DEFAULT_ANALYSIS_JOB_LOAD} as the default.
   *
   * @param properties
   *          the service configuration
   * @throws ConfigurationException
   *           declared by the {@link ManagedService} contract
   */
  @Override
  public void updated(@SuppressWarnings("rawtypes") Dictionary properties) throws ConfigurationException {
    final float configuredLoad = LoadUtil.getConfiguredLoadValue(properties, ANALYSIS_JOB_LOAD_KEY,
            DEFAULT_ANALYSIS_JOB_LOAD, serviceRegistry);
    analysisJobLoad = configuredLoad;
  }

  /**
   * Declares a dependency on the readiness indicator registered with {@code artifact=dictionary}. The indicator
   * itself is not stored or used; the reference only exists so that this service is not activated before the
   * indicator is registered.
   *
   * @param readinessIndicator
   *          the readiness indicator, ignored
   */
  @Reference(target = "(artifact=dictionary)")
  public void setReadinessIndicator(ReadinessIndicator readinessIndicator) {
    //Only activate service if ReadinessIndicator is registered.
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy