All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.executable.ExecutableParser Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.executable;

import java.io.IOException;
import java.io.InputStream;
import java.sql.Date;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Parser for executable files. Currently supports ELF and PE.
 * <p>
 * Only metadata is extracted (content type, platform, machine type,
 * endian-ness, architecture bits and, for PE files, the creation date);
 * the XHTML document that is emitted has no body content of its own.
 */
public class ExecutableParser extends AbstractParser implements MachineMetadata {
    /** Serial version UID */
    private static final long serialVersionUID = 32128791892482L;

    private static final MediaType PE_EXE = MediaType.application("x-msdownload");
    private static final MediaType ELF_GENERAL = MediaType.application("x-elf");
    private static final MediaType ELF_OBJECT = MediaType.application("x-object");
    private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
    private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
    private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");

    /**
     * Number of bytes already consumed from the stream once the 4 byte
     * PE header offset (stored at 0x3c) has been read.
     */
    private static final int PE_OFFSET_FIELD_END = 0x40;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    PE_EXE,
                    ELF_GENERAL,
                    ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP
            )));

    /**
     * Returns the (unmodifiable) set of media types handled by this parser.
     *
     * @param context the parse context (not consulted)
     * @return the PE and ELF media types
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Sniffs the first four bytes of the stream and delegates to
     * {@link #parsePE} (files starting with "MZ") or {@link #parseELF}
     * (files starting with 0x7f "ELF"). Anything else produces an empty
     * document and no additional metadata.
     *
     * @param stream   the executable to inspect
     * @param handler  receives the (empty) XHTML document
     * @param metadata populated with whatever could be determined
     * @param context  the parse context (not consulted)
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // We only do metadata, for now
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // What kind is it?
        byte[] first4 = new byte[4];
        IOUtils.readFully(stream, first4);

        if (first4[0] == (byte) 'M' && first4[1] == (byte) 'Z') {
            parsePE(xhtml, metadata, stream, first4);
        } else if (first4[0] == (byte) 0x7f && first4[1] == (byte) 'E' &&
                   first4[2] == (byte) 'L' && first4[3] == (byte) 'F') {
            parseELF(xhtml, metadata, stream, first4);
        }

        // Finish everything
        xhtml.endDocument();
    }

    /**
     * Parses a DOS or Windows PE file.
     * <p>
     * The content type and platform are always set. The creation date and
     * machine details are only set when a valid "PE\0\0" signature is found
     * at the offset stored at 0x3c; otherwise the file is treated as an
     * old style MS-DOS executable and the method returns early.
     *
     * @param xhtml    the document being built (not written to)
     * @param metadata populated with the PE details
     * @param stream   the file, positioned just after the first four bytes
     * @param first4   the already consumed first four bytes (not inspected)
     */
    public void parsePE(XHTMLContentHandler xhtml, Metadata metadata,
            InputStream stream, byte[] first4) throws TikaException, IOException {
        metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString());
        metadata.set(PLATFORM, PLATFORM_WINDOWS);

        // Skip over the MS-DOS bit, up to the PE header offset field at 0x3c
        byte[] msdosSection = new byte[0x3c - 4];
        IOUtils.readFully(stream, msdosSection);

        // Grab the PE header offset
        int peOffset = LittleEndian.readInt(stream);

        // Sanity check - while it may go anywhere, it's normally in the first
        //  few kb. It also cannot lie inside the bytes we have already read,
        //  as that would require skipping backwards.
        if (peOffset > 4096 || peOffset < PE_OFFSET_FIELD_END) {
            return;
        }

        // Skip the rest of the MS-DOS stub (if PE), until we reach what should
        //  be the PE header (if this is a PE executable)
        skipFully(stream, peOffset - PE_OFFSET_FIELD_END);

        // Read the signature (4 bytes) followed by the 20 byte file header:
        //   4: machine, 6: number of sections, 8: timestamp,
        //  12: symbol table offset, 16: number of symbols,
        //  20: optional header size, 22: characteristics
        byte[] pe = new byte[24];
        IOUtils.readFully(stream, pe);

        // Check it really is a PE header
        boolean hasPeSignature =
                pe[0] == (byte) 'P' && pe[1] == (byte) 'E' && pe[2] == 0 && pe[3] == 0;
        if (!hasPeSignature) {
            // Old style MS-DOS
            return;
        }

        // Read the header values we report on
        int machine = LittleEndian.getUShort(pe, 4);
        // The timestamp is an unsigned 32 bit count of seconds, so mask off
        //  the sign extension to avoid negative dates for large values
        long createdAt = LittleEndian.getInt(pe, 8) & 0xFFFFFFFFL;

        // Turn this into helpful metadata
        metadata.set(Metadata.CREATION_DATE, new java.util.Date(createdAt * 1000L));
        setPEMachineMetadata(metadata, machine);
    }

    /**
     * Skips exactly <code>count</code> bytes, or stops quietly at the end of
     * the stream. A single {@link InputStream#skip(long)} call is allowed to
     * skip fewer bytes than asked for, so we have to loop.
     */
    private static void skipFully(InputStream stream, long count) throws IOException {
        long remaining = count;
        while (remaining > 0) {
            long skipped = stream.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (stream.read() >= 0) {
                // skip() made no progress, but a byte was still available
                remaining--;
            } else {
                // End of stream - leave it to the caller's next read to notice
                return;
            }
        }
    }

    /**
     * Maps the PE file header machine value onto the machine type,
     * endian-ness and architecture bits metadata.
     */
    private static void setPEMachineMetadata(Metadata metadata, int machine) {
        switch (machine) {
            case 0x14c:
                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            case 0x8664:
                // AMD64 / x64
                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "64");
                break;
            case 0x200:
                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "64");
                break;

            case 0x184:
                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            case 0x284:
                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "64");
                break;

            case 0x1c0:
            case 0x1c4:
                metadata.set(MACHINE_TYPE, MACHINE_ARM);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0x268:
                metadata.set(MACHINE_TYPE, MACHINE_M68K);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            // NOTE(review): the "16" bit value reported for both MIPS groups
            //  below looks doubtful - confirm against the PE specification
            case 0x266:
            case 0x366:
            case 0x466:
                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "16");
                break;
            case 0x162:
            case 0x166:
            case 0x168:
            case 0x169:
                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "16");
                break;

            case 0x1f0:
            case 0x1f1:
                metadata.set(MACHINE_TYPE, MACHINE_PPC);
                metadata.set(ENDIAN, Endian.LITTLE.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0x1a2:
            case 0x1a3:
                metadata.set(MACHINE_TYPE, MACHINE_SH3);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            case 0x1a6:
                metadata.set(MACHINE_TYPE, MACHINE_SH4);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;
            case 0x1a8:
                // NOTE(review): reported as SH3, same as 0x1a2/0x1a3 - confirm
                metadata.set(MACHINE_TYPE, MACHINE_SH3);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0x9041:
                metadata.set(MACHINE_TYPE, MACHINE_M32R);
                metadata.set(ENDIAN, Endian.BIG.getName());
                metadata.set(ARCHITECTURE_BITS, "32");
                break;

            case 0xebc:
                metadata.set(MACHINE_TYPE, MACHINE_EFI);
                break;

            default:
                metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
                break;
        }
    }

    /**
     * Parses a Unix ELF file.
     * <p>
     * Reads the remainder of the identification bytes (architecture,
     * endian-ness, version, OS ABI and padding) followed by the object type
     * and machine fields, and records what it recognises as metadata.
     *
     * @param xhtml    the document being built (not written to)
     * @param metadata populated with the ELF details
     * @param stream   the file, positioned just after the four magic bytes
     * @param first4   the already consumed magic bytes (not inspected)
     */
    public void parseELF(XHTMLContentHandler xhtml, Metadata metadata,
            InputStream stream, byte[] first4) throws TikaException, IOException {
        // Byte 5 is the architecture
        int architecture = stream.read();
        if (architecture == 1) {
            metadata.set(ARCHITECTURE_BITS, "32");
        } else if (architecture == 2) {
            metadata.set(ARCHITECTURE_BITS, "64");
        }

        // Byte 6 is the endian-ness
        int endian = stream.read();
        if (endian == 1) {
            metadata.set(ENDIAN, Endian.LITTLE.getName());
        } else if (endian == 2) {
            metadata.set(ENDIAN, Endian.BIG.getName());
        }

        // Byte 7 is the elf version, which we don't report but must consume
        stream.read();

        // Byte 8 is the OS, if set (lots of compilers don't)
        // Byte 9 is the OS (specific) ABI version
        int os = stream.read();
        int osVer = stream.read();
        if (os > 0 || osVer > 0) {
            setELFPlatform(metadata, os);
        }

        // Bytes 10-16 are padding and lengths
        byte[] padLength = new byte[7];
        IOUtils.readFully(stream, padLength);

        // Anything other than an explicit little endian marker is read as big endian
        boolean littleEndian = (endian == 1);

        // Bytes 16-17 are the object type (LE/BE)
        int type = littleEndian
                ? EndianUtils.readUShortLE(stream)
                : EndianUtils.readUShortBE(stream);
        setELFContentType(metadata, type);

        // Bytes 18-19 are the machine (EM_*)
        int machine = littleEndian
                ? EndianUtils.readUShortLE(stream)
                : EndianUtils.readUShortBE(stream);
        setELFMachineType(metadata, machine);

        // Bytes 20-23 are the version
        // TODO
    }

    /**
     * Maps the ELF OS ABI byte onto the platform metadata. Values that
     * are not listed leave the platform unset.
     */
    private static void setELFPlatform(Metadata metadata, int os) {
        switch (os) {
            case 0:
                metadata.set(PLATFORM, PLATFORM_SYSV);
                break;
            case 1:
                metadata.set(PLATFORM, PLATFORM_HPUX);
                break;
            case 2:
                metadata.set(PLATFORM, PLATFORM_NETBSD);
                break;
            case 3:
                metadata.set(PLATFORM, PLATFORM_LINUX);
                break;
            case 6:
                metadata.set(PLATFORM, PLATFORM_SOLARIS);
                break;
            case 7:
                metadata.set(PLATFORM, PLATFORM_AIX);
                break;
            case 8:
                metadata.set(PLATFORM, PLATFORM_IRIX);
                break;
            case 9:
                metadata.set(PLATFORM, PLATFORM_FREEBSD);
                break;
            case 10:
                metadata.set(PLATFORM, PLATFORM_TRU64);
                break;
            case 12:
                // NOTE(review): reported as FreeBSD, same as 9 - confirm
                metadata.set(PLATFORM, PLATFORM_FREEBSD);
                break;
            case 64:
            case 97:
                metadata.set(PLATFORM, PLATFORM_ARM);
                break;
            case 255:
                metadata.set(PLATFORM, PLATFORM_EMBEDDED);
                break;
        }
    }

    /**
     * Maps the ELF object type onto a content type, falling back to the
     * general ELF type for anything unrecognised.
     */
    private static void setELFContentType(Metadata metadata, int type) {
        MediaType contentType;
        switch (type) {
            case 1:
                contentType = ELF_OBJECT;
                break;
            case 2:
                contentType = ELF_EXECUTABLE;
                break;
            case 3:
                contentType = ELF_SHAREDLIB;
                break;
            case 4:
                contentType = ELF_COREDUMP;
                break;
            default:
                contentType = ELF_GENERAL;
                break;
        }
        metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
    }

    /**
     * Maps the ELF machine (EM_*) value onto the machine type metadata.
     * Values that are not listed leave the machine type unset.
     */
    private static void setELFMachineType(Metadata metadata, int machine) {
        switch (machine) {
            case 2:
            case 18:
            case 43:
                metadata.set(MACHINE_TYPE, MACHINE_SPARC);
                break;
            case 3:
                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
                break;
            case 4:
                metadata.set(MACHINE_TYPE, MACHINE_M68K);
                break;
            case 5:
                metadata.set(MACHINE_TYPE, MACHINE_M88K);
                break;
            case 8:
            case 10:
                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
                break;
            case 7:
                metadata.set(MACHINE_TYPE, MACHINE_S370);
                break;
            case 20:
            case 21:
                metadata.set(MACHINE_TYPE, MACHINE_PPC);
                break;
            case 22:
                metadata.set(MACHINE_TYPE, MACHINE_S390);
                break;
            case 40:
                metadata.set(MACHINE_TYPE, MACHINE_ARM);
                break;
            case 41:
            case 0x9026:
                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
                break;
            case 50:
                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
                break;
            case 62:
                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
                break;
            case 75:
                metadata.set(MACHINE_TYPE, MACHINE_VAX);
                break;
            case 88:
                metadata.set(MACHINE_TYPE, MACHINE_M32R);
                break;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy