All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.gagravarr.tika.OggDetector Maven / Gradle / Ivy

There is a newer version: 0.8
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gagravarr.tika;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.gagravarr.ogg.OggFile;
import org.gagravarr.ogg.OggPacket;
import org.gagravarr.ogg.OggPacketReader;
import org.gagravarr.ogg.OggStreamIdentifier;
import org.gagravarr.ogg.OggStreamIdentifier.OggStreamType;
import org.gagravarr.skeleton.SkeletonPacket;
import org.gagravarr.skeleton.SkeletonPacketFactory;

/**
 * Detector for identifying specific file types stored
 *  within an Ogg container.
 * Xiph provide a fairly unhelpful guide to mimetypes at
 *  https://wiki.xiph.org/index.php/MIME_Types_and_File_Extensions
 *  but we try to use more specific ones, as given by the Tika
 *  mimetypes xml file.
 */
public class OggDetector implements Detector {
   private static final long serialVersionUID = 591382028699008553L;

   public static final MediaType OGG_GENERAL = MediaType.application("ogg");
   protected static final MediaType OGG_AUDIO = MediaType.audio("ogg");
   protected static final MediaType OGG_VIDEO = MediaType.video("ogg");

   public MediaType detect(InputStream input, Metadata metadata)
         throws IOException {
      // Check if we have access to the document
      if (input == null) {
          return MediaType.OCTET_STREAM;
      }

      // Ensure we can mark and reset before detecting
      // Otherwise bail, to avoid us nibbling an unwindable stream
      if (!input.markSupported()) {
          return MediaType.OCTET_STREAM;
      }

      // Check if the document starts with the OGG header
      input.mark(4);
      try {
          if (input.read() != (byte)'O' || input.read() != (byte)'g'
                  || input.read() != (byte)'g' || input.read() != (byte)'S') {
              return MediaType.OCTET_STREAM;
          }
      } finally {
          input.reset();
      }

      // We can only detect the exact type when given a TikaInputStream
      TikaInputStream tis = TikaInputStream.cast(input);
      if (tis != null) {
         // We could potentially need to go a long way through the
         //  file in order to figure out what it is
         tis.mark((int)tis.getLength()+1);

         OggFile ogg = new OggFile(tis);

         // The things we'll want to keep track of
         int totalStreams = 0;
         List sids = new ArrayList();
         Map streams = 
                 new HashMap();
         Map> skeletonStreams =
                 new HashMap>();

         // Check the streams in turn
         OggPacketReader r = ogg.getPacketReader();
         OggPacket p;
         Integer sid;
         while( (p = r.getNextPacket()) != null ) {
            if(p.isBeginningOfStream()) {
               totalStreams++;
               sids.add(p.getSid());

               OggStreamType type = OggStreamIdentifier.identifyType(p);

               // If it's a Skeleton stream, start tracking
               if (type == OggStreamIdentifier.SKELETON) {
                   List sp = new ArrayList();
                   sp.add(SkeletonPacketFactory.create(p));
                   skeletonStreams.put(p.getSid(), sp);
               }

               // Increment the per-type count
               Integer num = streams.get(type);
               if (num == null) {
                   num = 1;
               } else {
                   num = num + 1;
               }
               streams.put(type, num);
            } else {
                sid = p.getSid();

                // Is it a skeleton stream?
                if (skeletonStreams.containsKey(sid)) {
                    skeletonStreams.get(sid).add(SkeletonPacketFactory.create(p));
                } else {
                    // We don't worry about later packets in non-skeleton
                    // streams at this stage, only parsers mind about them
                }
            }
         }

         // Tidy
         tis.reset();

         // TODO See if we found any of the Ogg Metadata streams,
         //  eg Ogg Skeleton / Annodex or CMML, and if so use them
         //  to help identify the relationship between the different
         //  streams and hence the type

         // Can we identify what it really is?
         // First up, is it a simple single stream file?
         if (totalStreams == 1) {
             OggStreamType type = streams.keySet().iterator().next();
             return toMediaType(type);
         }

         // Is it one with a single non-metadata stream?
         int nonMetadataStreams = 0;
         for (OggStreamType type : streams.keySet()) {
             if (type.kind != OggStreamType.Kind.METADATA) {
                 nonMetadataStreams += streams.get(type);
             }
         }

         if (nonMetadataStreams == 0) {
             // Pure metadata, report as general ogg
             return OGG_GENERAL;
         }

         if (nonMetadataStreams == 1) {
             // Report as the non metadata kind
             for (OggStreamType type : streams.keySet()) {
                 if (type.kind != OggStreamType.Kind.METADATA) {
                     return toMediaType(type);
                 }
             }
         }


         // Is it a single video stream, with zero or more audio streams?
         int videoCount = 0;
         int audioCount = 0;
         Set audioTypes = new HashSet();
         Set videoTypes = new HashSet();
         for (OggStreamType type : streams.keySet()) {
             if (type.kind == OggStreamType.Kind.VIDEO) {
                 videoCount += streams.get(type);
                 videoTypes.add(type);
             }
             if (type.kind == OggStreamType.Kind.AUDIO) {
                 audioCount += streams.get(type);
                 audioTypes.add(type);
             }
         }
         if (videoCount == 1) {
             // Report it as the video type, not the audio within that
             return toMediaType( videoTypes.iterator().next() );
         }


         // Is it multiple audio streams, with no video?
         if (videoCount == 0 && audioCount > 1) {
             // Are they all the same audio kind?
             if (audioTypes.size() == 1) {
                 // All the same kind, report it as that
                 return toMediaType( audioTypes.iterator().next() );
             } else {
                 // Mixture of audio types, report as general audio
                 return OGG_AUDIO;
             }
         }

         // Is it multiple video streams?
         if (videoCount > 1) {
             // Are they all the same video kind?
             if (videoTypes.size() == 1) {
                 // All the same kind, report it as that video
                 // (Don't mention the audio in it, if any)
                 return toMediaType( videoTypes.iterator().next() );
             } else {
                 // Mixture of video types, possibly with audio, report
                 // it as general video as the best we can do
                 return OGG_VIDEO;
             }
         }

         // If we get here, then we can't work out what it is
      }

      // Couldn't determine a more specific type
      return OGG_GENERAL;
   }

   /**
    * Converts from our type to Tika's type
    */
   protected static MediaType toMediaType(OggStreamType type) {
       if (type == OggStreamIdentifier.UNKNOWN) {
           // We don't have a specific type available to return
           return OGG_GENERAL;
       } else {
           // Say it's the specific type we found
           return MediaType.parse(type.mimetype);
       }
   }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy