All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.external.ExternalParser Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.external;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.NullOutputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Parser that uses an external program (like catdoc or pdf2txt) to extract
 *  text content and metadata from a given document.
 */
public class ExternalParser extends AbstractParser {

    /**
     * Callback that is handed individual lines of text, one at a time.
     *
     * @since Apache Tika 1.14
     */
    public interface LineConsumer extends Serializable {

        /**
         * A consumer that silently discards every line it is given.
         */
        LineConsumer NULL = new LineConsumer() {
            @Override
            public void consume(String line) {
                // deliberately a no-op
            }
        };

        /**
         * Consumes a single line.
         *
         * @param line the line of text to handle
         */
        void consume(String line);
    }

    /** Version identifier used by Java serialization for this class. */
    private static final long serialVersionUID = -1079128990650687037L;
    
    /**
     * The token which, if present in the command, will
     *  be replaced with the input filename.
     * Alternatively, the input data can be streamed over STDIN.
     */
    public static final String INPUT_FILE_TOKEN = "${INPUT}";
    /**
     * The token which, if present in the command, will
     *  be replaced with the output filename.
     * Alternatively, the output data can be collected on STDOUT.
     */
    public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";

    /**
     * Media types supported by the external program.
     */
    private Set<MediaType> supportedTypes = Collections.emptySet();

    /**
     * Regular expressions to run over the lines of the external program's
     *  output in order to extract metadata. Each pattern maps to the name
     *  of the metadata entry to set from the pattern's first group; a
     *  null or empty name means the first group supplies the name and
     *  the second group the value.
     * A null map means no patterns are configured.
     */
    private Map<Pattern, String> metadataPatterns = null;

    /**
     * The external command to invoke.
     * @see Runtime#exec(String[])
     */
    private String[] command = new String[] { "cat" };

    /**
     * A consumer for lines that matched none of the metadata patterns.
     */
    private LineConsumer ignoredLineConsumer = LineConsumer.NULL;

    /**
     * Returns the media types supported by the external program. The
     *  given context is not consulted; this simply delegates to
     *  {@link #getSupportedTypes()}.
     *
     * @param context the parse context (unused)
     * @return the configured set of supported media types
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return getSupportedTypes();
    }

    /**
     * Returns the media types supported by the external program.
     *
     * @return the currently configured set of supported media types
     */
    public Set<MediaType> getSupportedTypes() {
        return supportedTypes;
    }

    /**
     * Sets the media types supported by the external program. The given
     *  set is copied and the copy is wrapped as unmodifiable, so later
     *  changes to the argument do not affect this parser.
     *
     * @param supportedTypes the supported media types; must not be null
     * @throws NullPointerException if {@code supportedTypes} is null
     */
    public void setSupportedTypes(Set<MediaType> supportedTypes) {
        this.supportedTypes =
            Collections.unmodifiableSet(new HashSet<MediaType>(supportedTypes));
    }


    /**
     * Returns the external command that will be run.
     *
     * @return the command array held by this parser (not a copy)
     */
    public String[] getCommand() {
        return this.command;
    }

    /**
     * Sets the command to be run. The command may contain
     *  {@link #INPUT_FILE_TOKEN} and/or {@link #OUTPUT_FILE_TOKEN}
     *  where the external program needs filenames.
     * The given array is stored as-is, without copying.
     *
     * @param command the program and its arguments
     * @see Runtime#exec(String[])
     */
    public void setCommand(final String... command) {
        this.command = command;
    }

    /**
     * Returns the consumer that receives lines ignored by the parse
     *  functions.
     *
     * @return the consumer instance currently in use
     */
    public LineConsumer getIgnoredLineConsumer() {
        return this.ignoredLineConsumer;
    }

    /**
     * Sets the consumer that receives the lines ignored by the parse
     *  functions, i.e. lines matching none of the metadata patterns.
     *
     * @param ignoredLineConsumer the consumer instance to use
     */
    public void setIgnoredLineConsumer(final LineConsumer ignoredLineConsumer) {
        this.ignoredLineConsumer = ignoredLineConsumer;
    }

    /**
     * Returns the map of regular expression patterns to metadata keys
     *  used for metadata extraction.
     *
     * @return the configured pattern map (not a copy), or null if none
     *         has been set
     */
    public Map<Pattern, String> getMetadataExtractionPatterns() {
       return metadataPatterns;
    }
    
    /**
     * Sets the map of regular expression patterns and Metadata
     *  keys. Any matching patterns will have the matching
     *  metadata entries set.
     * Set this to null to disable Metadata extraction.
     * The given map is stored as-is, without copying.
     *
     * @param patterns pattern-to-metadata-key map, or null
     */
    public void setMetadataExtractionPatterns(Map<Pattern, String> patterns) {
       this.metadataPatterns = patterns;
    }
    

    /**
     * Executes the configured external command and passes the given document
     *  stream as a simple XHTML document to the given SAX content handler.
     * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
     *  has been called to set patterns.
     * <p>
     * Temporary resources created while parsing are disposed of before
     *  this method returns, whether or not parsing succeeded.
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        final TemporaryResources resources = new TemporaryResources();
        try {
            // Tie the wrapped stream to the temporary resources so anything
            // it creates is cleaned up together with them below.
            final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
            parse(tikaStream, xhtml, metadata, resources);
        } finally {
            resources.dispose();
        }
    }

    private void parse(
            TikaInputStream stream, XHTMLContentHandler xhtml,
            Metadata metadata, TemporaryResources tmp)
            throws IOException, SAXException, TikaException {
        boolean inputToStdIn = true;
        boolean outputFromStdOut = true;
        boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());

        File output = null;

        // Build our command
        String[] cmd;
        if (command.length == 1) {
            cmd = command[0].split(" ");
        } else {
            cmd = new String[command.length];
            System.arraycopy(command, 0, cmd, 0, command.length);
        }
        for(int i=0; inot
     * closed by this method.
     *
     * @param process process
     * @param stream input stream
     */
    /**
     * Starts a thread that copies the contents of the given input stream
     * to the standard input of the given process, and blocks the calling
     * thread until that copy has finished. I/O errors during the copy are
     * ignored. Neither the given input stream nor the standard input of
     * the process is closed by this method.
     * <p>
     * If the calling thread is interrupted while waiting, the wait ends
     * early and the thread's interrupt status is restored so that callers
     * can still observe the interruption.
     *
     * @param process process whose standard input receives the data
     * @param stream input stream to copy from
     */
    private void sendInput(final Process process, final InputStream stream) {
        Thread copier = new Thread() {
            @Override
            public void run() {
                OutputStream stdin = process.getOutputStream();
                try {
                    IOUtils.copy(stream, stdin);
                } catch (IOException ignored) {
                    // Best effort: the process may exit (closing its stdin)
                    // before all of the input has been consumed.
                }
            }
        };
        copier.start();
        try {
            copier.join();
        } catch (InterruptedException e) {
            // Do not swallow the interruption; hand it back to the caller.
            Thread.currentThread().interrupt();
        }
    }


    /**
     * Reads and discards everything from the given stream on a separate
     * thread, closing the stream once it has been fully processed.
     * Potential exceptions are ignored.
     * Note: the calling thread is blocked until the new thread dies.
     *
     * @param stream stream to be ignored
     */
    private static void ignoreStream(final InputStream stream) {
        final boolean waitForDeath = true;
        ignoreStream(stream, waitForDeath);
    }

    /**
     * Starts a thread that reads and discards the contents of the given
     * stream. Potential I/O exceptions are ignored, and the stream is
     * closed once fully processed.
     * <p>
     * If the calling thread is interrupted while waiting for the new
     * thread, the wait ends early and the interrupt status is restored.
     *
     * @param stream stream to be sent to the black hole (a.k.a. null)
     * @param waitForDeath when {@code true} the calling thread is blocked
     *        until the new thread dies
     * @return the thread that was created and started
     */
    private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) {
        Thread drainer = new Thread() {
            @Override
            public void run() {
                try {
                    IOUtils.copy(stream, new NullOutputStream());
                } catch (IOException ignored) {
                    // Best effort: the content is being thrown away anyway.
                } finally {
                    IOUtils.closeQuietly(stream);
                }
            }
        };
        drainer.start();
        if (waitForDeath) {
            try {
                drainer.join();
            } catch (InterruptedException e) {
                // Do not swallow the interruption; hand it back to the caller.
                Thread.currentThread().interrupt();
            }
        }
        return drainer;
    }
    
    /**
     * Starts a thread that reads the given stream line by line as UTF-8
     * and matches every line against each configured metadata pattern,
     * then blocks the calling thread until the stream is exhausted.
     * <p>
     * For each pattern found in a line: if the pattern maps to a non-empty
     * metadata name, the first group is added under that name; otherwise
     * the first group is used as the name and the second group as the
     * value. Patterns therefore need one or two capturing groups
     * respectively. Lines that match no pattern are passed to the ignored
     * line consumer. I/O errors are ignored, and the stream is closed
     * once reading ends.
     * <p>
     * If the calling thread is interrupted while waiting, the wait ends
     * early and the thread's interrupt status is restored.
     *
     * @param stream stream to read lines from; closed by this method
     * @param metadata metadata object that receives the extracted values
     */
    private void extractMetadata(final InputStream stream, final Metadata metadata) {
        Thread extractor = new Thread() {
            @Override
            public void run() {
                // Read the field once so every line is matched against
                // the same pattern map.
                Map<Pattern, String> patterns = metadataPatterns;
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(stream, UTF_8));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        boolean consumed = false;
                        for (Map.Entry<Pattern, String> entry : patterns.entrySet()) {
                            Matcher m = entry.getKey().matcher(line);
                            if (!m.find()) {
                                continue;
                            }
                            consumed = true;
                            String name = entry.getValue();
                            if (name != null && !name.isEmpty()) {
                                metadata.add(name, m.group(1));
                            } else {
                                // No fixed name: the line itself provides
                                // both the name and the value.
                                metadata.add(m.group(1), m.group(2));
                            }
                        }
                        if (!consumed) {
                            ignoredLineConsumer.consume(line);
                        }
                    }
                } catch (IOException ignored) {
                    // Best effort: keep whatever was extracted so far.
                } finally {
                    IOUtils.closeQuietly(reader);
                    IOUtils.closeQuietly(stream);
                }
            }
        };
        extractor.start();
        try {
            extractor.join();
        } catch (InterruptedException e) {
            // Do not swallow the interruption; hand it back to the caller.
            Thread.currentThread().interrupt();
        }
    }
    
    /**
     * Checks to see if the command can be run. Typically used with
     *  something like "myapp --version" to check to see if "myapp"
     *  is installed and on the path.
     * <p>
     * The command line is split on whitespace into the program name and
     *  its arguments before being run. Passing the whole line as a single
     *  array element would make the runtime look for a program literally
     *  named "myapp --version".
     *
     * @param checkCmd The check command to run, arguments separated by
     *        whitespace
     * @param errorValue The exit values that are considered an error;
     *        if none are given, 127 is used
     * @return true if the command ran and did not exit with one of the
     *         error values, false otherwise
     */
    public static boolean check(String checkCmd, int... errorValue) {
       return check(checkCmd.trim().split("\\s+"), errorValue);
    }

    /**
     * Checks to see if the command can be run, by running it and looking
     *  at its exit value. The standard output and standard error of the
     *  process are read and discarded.
     * <p>
     * If the calling thread is interrupted while waiting, false is
     *  returned and the thread's interrupt status is restored.
     *
     * @param checkCmd The check command to run: the program followed by
     *        its arguments, passed unchanged to
     *        {@link Runtime#exec(String[])}
     * @param errorValue The exit values that are considered an error;
     *        if none are given, 127 is used
     * @return true if the command ran and did not exit with one of the
     *         error values, false if it could not be run or exited with
     *         one of them
     */
    public static boolean check(String[] checkCmd, int... errorValue) {
       if (errorValue.length == 0) {
          errorValue = new int[] { 127 };
       }

       try {
          Process process = Runtime.getRuntime().exec(checkCmd);
          // Drain both streams on their own threads before waiting, so the
          // process cannot stall on a full output pipe.
          Thread stdErrSuckerThread = ignoreStream(process.getErrorStream(), false);
          Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false);
          stdErrSuckerThread.join();
          stdOutSuckerThread.join();
          int result = process.waitFor();
          for (int err : errorValue) {
             if (result == err) {
                return false;
             }
          }
          return true;
       } catch (IOException e) {
          // Some problem: the command isn't there or is broken
          return false;
       } catch (InterruptedException ie) {
          // Interrupted while waiting: report failure, but keep the
          // interruption visible to the caller.
          Thread.currentThread().interrupt();
          return false;
       } catch (SecurityException se) {
          // External process execution is banned by the security manager
          return false;
       } catch (Error err) {
           if (err.getMessage() != null &&
               (err.getMessage().contains("posix_spawn") ||
               err.getMessage().contains("UNIXProcess"))) {
               //"Error forking command due to JVM locale bug
               //(see TIKA-1526 and SOLR-6387)"
               return false;
           }
           //throw if a different kind of error
           throw err;
       }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy