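// NOTE: The next two lines are residue of the web page this source was copied from; they are not part of the program.
// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
// eu.medsea.mimeutil.detector.MagicMimeMimeDetector Maven / Gradle / Ivy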

/*
 * Copyright 2007-2009 Medsea Business Solutions S.L.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.medsea.mimeutil.detector;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.medsea.mimeutil.MimeException;
import eu.medsea.mimeutil.MimeUtil;

/**
 * The magic mime rules files are loaded in the following way.
 * 
    *
  1. From a JVM system property magic-mime i.e * -Dmagic-mime=../my/magic/mime/rules
  2. *
  3. From any file named magic.mime that can be found on the * classpath
  4. *
  5. From a file named .magic.mime in the users home directory
  6. *
  7. From the normal Unix locations /usr/share/file/magic.mime * and /etc/magic.mime (in that order)
  8. *
  9. From the internal magic.mime file * eu.medsea.mimeutil.magic.mime if, and only if, no files are * located in step 4 above.
  10. *
* Each rule file is appended to the end of the existing rules so the earlier in * the sequence you define a rule means this will take precedence over rules * loaded later.

*

* You can add new mime mapping rules using the syntax defined for the Unix * magic.mime file by placing these rules in any of the files or locations * listed above. You can also change an existing mapping rule by redefining the * existing rule in one of the files listed above. This is handy for some of the * more sketchy rules defined in the existing Unix magic.mime files. *

* We extended the string type rule which allows you to match strings in a file * where you do not know the actual offset of the string containing magic file * information it goes something like ?what I am looking for will be ?somewhere? * within the next n characters? from this location. This is an important * improvement to the string matching rules especially for text based documents * such as HTML and XML formats. The reasoning for this was that the rules for * matching SVG images defined in the original 'magic.mime' file hardly ever * worked, this is because of the fixed offset definitions within the magic rule * format. As XML documents generally have an XML declaration that can contain * various optional attributes the length of this header often cannot be * determined, therefore we cannot know that the DOCTYPE declaration for an SVG * xml file starts at ?this? location, all we can say is that, if this is an SVG * xml file then it will have an SVG DOCTYPE somewhere near the beginning of the * file and probably within the first 1024 characters. So we test for the xml * declaration and then we test for the DOCTYPE within a specified number of * characters and if found then we match this rule. This extension can be used * to better identify ALL of the XML type mime mappings in the current * 'magic.mime' file. Remember though, as we stated earlier mime type matching * using any of the mechanisms supported is not an exact science and should * always be viewed as a 'best guess' and not as a 'definite match'. *

*

* An example of overriding the PNG and SVG rules can be found in our internal * 'magic.mime' file located in the test_files directory (this file is NOT used * when locating rules and is used for testing purposes only). This PNG rule * overrides the original PNG rule defined in the 'magic.mime' file we took from * the Internet, and the SVG rule overrides the SVG detection also defined in * the original 'magic.mime' file *

*

* *

 * #PNG Image Format
 * 0		string		\211PNG\r\n\032\n		image/png
 *
 * #SVG Image Format
 * #	We know its an XML file so it should start with an XML declaration.
 * 0	string	\<?xml\ version=	text/xml
 * #	As the XML declaration in an XML file can be short or extended we cannot know
 * #	exactly where the declaration ends i.e. how long it is,
 * #	also it could be terminated by a new line(s) or a space(s).
 * #	So the next line states that somewhere after the 15th character position we should find the DOCTYPE declaration.
 * #	This DOCTYPE declaration should be within 1024 characters from the 15th character
 * >15	string>1024<	\<!DOCTYPE\ svg\ PUBLIC\ "-//W3C//DTD\ SVG 	image/svg+xml
 * 
* *

*

* As you can see the extension is defined using the syntax string>bufsize<. It * can only be used on a string type and basically means match this within * bufsize character from the position defined at the beginning of the line. * This rule is much more verbose than required as we really only need to check * for the presence of SVG. As we said earlier, this is a test case file and not * used by the utility under normal circumstances. * * The test mime-types.properties and magic.mime files we use can be located in * the test_files directory of this distribution. *

*

* We use the application/directory mime type to identify * directories. Even though this is not an official mime type it seems to be * well accepted on the net as an unofficial mime type so we thought it was OK * for us to use as well. *

*

* This class is auto loaded by MimeUtil as it has an entry in the file called * MimeDetectors. MimeUtil reads this file at startup and calls Class.forName() * on each entry found. This mean the MimeDetector must have a no arg * constructor. *

* * @author Steven McArdle. * */ public class MagicMimeMimeDetector extends MimeDetector { private static Log log = LogFactory.getLog(MagicMimeMimeDetector.class); private static String[] defaultLocations = { "/usr/share/mimelnk/magic", "/usr/share/file/magic.mime", "/etc/magic.mime" }; private static List magicMimeFileLocations = Arrays .asList(defaultLocations); private static ArrayList mMagicMimeEntries = new ArrayList(); // Initialise this MimeDetector and automatically register it with MimeUtil static { initMagicRules(); MimeUtil.addMimeDetector(new MagicMimeMimeDetector()); } public MagicMimeMimeDetector() { } public String getDescription() { return "Get the mime types of files or streams using the Unix file(5) magic.mime files"; } /** * Get the mime types that may be contained in the data array. * * @param data. The byte array that contains data we want to detect mime types from. * @return the mime types. * @throws MimeException if for instance we try to match beyond the end of the data. */ public Collection getMimeTypesByteArray(final byte[] data) throws MimeException { Collection mimeTypes = new HashSet(); int len = mMagicMimeEntries.size(); try { for (int i = 0; i < len; i++) { MagicMimeEntry me = (MagicMimeEntry) mMagicMimeEntries.get(i); MagicMimeEntry matchingMagicMimeEntry = me.getMatch(data); if (matchingMagicMimeEntry != null) { mimeTypes.add(matchingMagicMimeEntry.getMimeType()); } } } catch (Exception e) { log.error(e.getMessage(), e); } return mimeTypes; } /** * Get the mime types of the data in the specified {@link InputStream}. * Therefore, the InputStream must support mark and reset (see * {@link InputStream#markSupported()}). If it does not support mark and * reset, an {@link MimeException} is thrown. * * @param in * the stream from which to read the data. * @return the mime types. * @throws MimeException * if the specified InputStream does not support * mark and reset (see {@link InputStream#markSupported()}). 
*/ public Collection getMimeTypesInputStream(final InputStream in) throws MimeException { Collection mimeTypes = new HashSet(); int len = mMagicMimeEntries.size(); try { for (int i = 0; i < len; i++) { MagicMimeEntry me = (MagicMimeEntry) mMagicMimeEntries.get(i); MagicMimeEntry matchingMagicMimeEntry = me.getMatch(in); if (matchingMagicMimeEntry != null) { mimeTypes.add(matchingMagicMimeEntry.getMimeType()); } } } catch (Exception e) { log.error(e.getMessage(), e); } return mimeTypes; } /** * Try to get the mime types of a file using the magic.mime * rules files. * * @param file * is a {@link File} object that points to a file or directory. * @return the mime types. * @throws MimeException * if the file cannot be parsed. */ public Collection getMimeTypesFile(final File file) throws MimeException { Collection mimeTypes = new HashSet(); if (!file.exists()) { return mimeTypes; } if (file.isDirectory()) { mimeTypes.add(MimeUtil.DIRECTORY_MIME_TYPE); return mimeTypes; } int len = mMagicMimeEntries.size(); RandomAccessFile raf = null; try { raf = new RandomAccessFile(file, "r"); for (int i = 0; i < len; i++) { MagicMimeEntry me = (MagicMimeEntry) mMagicMimeEntries.get(i); MagicMimeEntry matchingMagicMimeEntry = me.getMatch(raf); if (matchingMagicMimeEntry != null) { mimeTypes.add(matchingMagicMimeEntry.getMimeType()); } } } catch (Exception e) { throw new MimeException("Error parsing file [" + file.getAbsolutePath() + "]", e); } finally { if (raf != null) { try { raf.close(); } catch (Exception e) { log.error("Error closing file.", e); } } } return mimeTypes; } /* * This loads the magic.mime file rules into the internal parse tree in the * following order 1. From any magic.mime that can be located on the * classpath 2. From any magic.mime file that can be located using the * environment variable MAGIC 3. From any magic.mime located in the users * home directory ~/.magic.mime file if the MAGIC environment variable is * not set 4. 
From the locations defined in the magicMimeFileLocations and * the order defined 5. From the internally defined magic.mime file ONLY if * we are unable to locate any of the files in steps 2 - 5 above Thanks go * to Simon Pepping for his bug report */ private static void initMagicRules() { InputStream is = null; // Try to locate a magic.mime file locate by system property magic-mime try { String fname = System.getProperty("magic-mime"); if (fname != null && fname.length() != 0) { is = new FileInputStream(fname); try { if (is != null) { parse("-Dmagic-mime=" + fname, new InputStreamReader(is)); } } finally { if (is != null) { try { is.close(); } catch (Exception e) { // ignore, but log in debug mode if (log.isDebugEnabled()) log.debug(e.getMessage(), e); } is = null; } } } } catch (Exception e) { log .error( "Failed to parse custom magic mime file defined by system property -Dmagic-mime [" + System.getProperty("magic-mime") + "]. File will be ignored.", e); } // Try to locate a magic.mime file on the classpath try { is = MimeUtil.class.getClassLoader().getResourceAsStream( "magic.mime"); try { if (is != null) { parse("classpath:magic-mime", new InputStreamReader(is)); } } finally { if (is != null) { try { is.close(); } catch (Exception e) { // ignore, but log in debug mode if (log.isDebugEnabled()) log.debug(e.getMessage(), e); } is = null; } } } catch (Exception e) { log .error( "Failed to parse magic.mime rule file on the classpath. File will be ignored.", e); } // Now lets see if we have one in the users home directory. 
This is // named .magic.mime as opposed to magic.mime try { File f = new File(System.getProperty("user.home") + File.separator + ".magic.mime"); if (f.exists()) { is = new FileInputStream(f); try { if (is != null) { parse(f.getAbsolutePath(), new InputStreamReader(is)); } } finally { if (is != null) { try { is.close(); } catch (Exception e) { // ignore, but log in debug mode if (log.isDebugEnabled()) log.debug(e.getMessage(), e); } is = null; } } } } catch (Exception e) { log .error( "Failed to parse .magic.mime file from the users home directory. File will be ignored.", e); } // Now lets see if we have an environment variable names MAGIC set. This // would normally point to a magic or magic.mgc file. // As we don't use these file types we will look to see if there is also // a magic.mime file at this location for us to use. try { String name = System.getProperty("MAGIC"); if (name != null && name.length() != 0) { // Strip the .mgc from the end if it's there and add the .mime // extension if (name.indexOf('.') < 0) { name = name + ".mime"; } else { // remove the mgc extension name = name.substring(0, name.indexOf('.') - 1) + "mime"; } File f = new File(name); if (f.exists()) { is = new FileInputStream(f); try { if (is != null) { parse(f.getAbsolutePath(), new InputStreamReader(is)); } } finally { if (is != null) { try { is.close(); } catch (Exception e) { // ignore, but log in debug mode if (log.isDebugEnabled()) log.debug(e.getMessage(), e); } is = null; } } } } } catch (Exception e) { log .error( "Failed to parse magic.mime file from directory located by environment variable MAGIC. File will be ignored.", e); } // Parse the UNIX magic(5) magic.mime files. Since there can be // multiple, we have to load all of them. // We save, how many entries we have now, in order to fall back to our // default magic.mime that we ship, // if no entries were read from the OS. 
int mMagicMimeEntriesSizeBeforeReadingOS = mMagicMimeEntries.size(); Iterator it = magicMimeFileLocations.iterator(); while (it.hasNext()) { parseMagicMimeFileLocation((String) it.next()); } if (mMagicMimeEntriesSizeBeforeReadingOS == mMagicMimeEntries.size()) { // Use the magic.mime that we ship try { String resource = "eu/medsea/mimeutil/magic.mime"; is = MimeUtil.class.getClassLoader().getResourceAsStream( resource); parse("resource:" + resource, new InputStreamReader(is)); } catch (Exception e) { log.error("Failed to process internal magic.mime file.", e); } finally { if (is != null) { try { is.close(); } catch (Exception e) { // ignore, but log in debug mode if (log.isDebugEnabled()) log.debug(e.getMessage(), e); } is = null; } } } } private static void parseMagicMimeFileLocation(final String location) { InputStream is = null; List magicMimeFiles = getMagicFilesFromMagicMimeFileLocation(location); for (Iterator itFile = magicMimeFiles.iterator(); itFile.hasNext();) { File f = (File) itFile.next(); try { is = new FileInputStream(f); if (f.exists()) { parse(f.getAbsolutePath(), new InputStreamReader(is)); } } catch (Exception e) { log.warn(e.getMessage()); } finally { if (is != null) { try { is.close(); } catch (Exception e) { // ignore, but log in debug mode if (log.isDebugEnabled()) log.debug(e.getMessage(), e); } is = null; } } } } private static List getMagicFilesFromMagicMimeFileLocation( final String magicMimeFileLocation) { List magicMimeFiles = new LinkedList(); if (magicMimeFileLocation.indexOf('*') < 0) { magicMimeFiles.add(new File(magicMimeFileLocation)); } else { int lastSlashPos = magicMimeFileLocation.lastIndexOf('/'); File dir; String fileNameSimplePattern; if (lastSlashPos < 0) { dir = new File("someProbablyNotExistingFile").getAbsoluteFile() .getParentFile(); fileNameSimplePattern = magicMimeFileLocation; } else { String dirName = magicMimeFileLocation.substring(0, lastSlashPos); if (dirName.indexOf('*') >= 0) throw new 
UnsupportedOperationException( "The wildcard '*' is not allowed in directory part of the location! Do you want to implement expressions like /path/**/*.mime for recursive search? Please do!"); dir = new File(dirName); fileNameSimplePattern = magicMimeFileLocation .substring(lastSlashPos + 1); } if (!dir.isDirectory()) return Collections.EMPTY_LIST; String s = fileNameSimplePattern.replaceAll("\\.", "\\\\."); s = s.replaceAll("\\*", ".*"); Pattern fileNamePattern = Pattern.compile(s); File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { File file = files[i]; if (fileNamePattern.matcher(file.getName()).matches()) magicMimeFiles.add(file); } } return magicMimeFiles; } // Parse the magic.mime file private static void parse(final String magicFile, final Reader r) throws IOException { long start = System.currentTimeMillis(); BufferedReader br = new BufferedReader(r); String line; ArrayList sequence = new ArrayList(); long lineNumber = 0; line = br.readLine(); if (line != null) ++lineNumber; while (true) { if (line == null) { break; } line = line.trim(); if (line.length() == 0 || line.charAt(0) == '#') { line = br.readLine(); if (line != null) ++lineNumber; continue; } sequence.add(line); // read the following lines until a line does not begin with '>' or // EOF while (true) { line = br.readLine(); if (line != null) ++lineNumber; if (line == null) { addEntry(magicFile, lineNumber, sequence); sequence.clear(); break; } line = line.trim(); if (line.length() == 0 || line.charAt(0) == '#') { continue; } if (line.charAt(0) != '>') { addEntry(magicFile, lineNumber, sequence); sequence.clear(); break; } sequence.add(line); } } if (!sequence.isEmpty()) { addEntry(magicFile, lineNumber, sequence); } if (log.isDebugEnabled()) log.debug("Parsing \"" + magicFile + "\" took " + (System.currentTimeMillis() - start) + " msec."); } private static void addEntry(final String magicFile, final long lineNumber, final ArrayList aStringArray) { try { MagicMimeEntry 
magicEntry = new MagicMimeEntry(aStringArray); mMagicMimeEntries.add(magicEntry); // Add this to the list of known mime types as well if (magicEntry.getMimeType() != null) { MimeUtil.addKnownMimeType(magicEntry.getMimeType()); } } catch (InvalidMagicMimeEntryException e) { // Continue on but lets print an exception so people can see there // is a problem log.warn(e.getClass().getName() + ": " + e.getMessage() + ": file \"" + magicFile + "\": before or at line " + lineNumber, e); } } }




// (web page footer, not part of the program) © 2015 - 2024 Weber Informatics LLC | Privacy Policy