All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.gdal.GDALParser Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.gdal;

//JDK imports
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN;

//Tika imports
//SAX imports

/**
 * Wraps execution of the Geospatial Data Abstraction
 * Library (GDAL) gdalinfo tool used to extract geospatial
 * information out of hundreds of geo file formats.
 * 

* The parser requires the installation of GDAL and for gdalinfo to * be located on the path. *

* Basic information (Size, Coordinate System, Bounding Box, Driver, and * resource info) are extracted as metadata, and the remaining metadata patterns * are extracted and added. *

* The output of the command is available from the provided * {@link ContentHandler} in the * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} method. */ public class GDALParser extends AbstractParser { private static final long serialVersionUID = -3869130527323941401L; private static final Logger LOG = LoggerFactory.getLogger(GDALParser.class); private String command; public GDALParser() { setCommand("gdalinfo ${INPUT}"); } public void setCommand(String command) { this.command = command; } public String getCommand() { return this.command; } public String processCommand(InputStream stream) { TikaInputStream tis = (TikaInputStream) stream; String pCommand = this.command; try { if (this.command.contains(INPUT_FILE_TOKEN)) { pCommand = this.command.replace(INPUT_FILE_TOKEN, tis.getFile() .getPath()); } } catch (Exception e) { LOG.warn("exception processing command", e); } return pCommand; } @Override public Set getSupportedTypes(ParseContext context) { Set types = new HashSet(); types.add(MediaType.application("x-netcdf")); types.add(MediaType.application("vrt")); types.add(MediaType.image("geotiff")); types.add(MediaType.image("nitf")); types.add(MediaType.application("x-rpf-toc")); types.add(MediaType.application("x-ecrg-toc")); types.add(MediaType.image("hfa")); types.add(MediaType.image("sar-ceos")); types.add(MediaType.image("ceos")); types.add(MediaType.application("jaxa-pal-sar")); types.add(MediaType.application("gff")); types.add(MediaType.application("elas")); types.add(MediaType.application("aig")); types.add(MediaType.application("aaigrid")); types.add(MediaType.application("grass-ascii-grid")); types.add(MediaType.application("sdts-raster")); types.add(MediaType.application("dted")); types.add(MediaType.image("png")); types.add(MediaType.image("jpeg")); types.add(MediaType.image("raster")); types.add(MediaType.application("jdem")); types.add(MediaType.image("gif")); types.add(MediaType.image("big-gif")); types.add(MediaType.image("envisat")); types.add(MediaType.image("fits")); types.add(MediaType.application("fits")); types.add(MediaType.image("bsb")); types.add(MediaType.application("xpm")); types.add(MediaType.image("bmp")); types.add(MediaType.image("x-dimap")); types.add(MediaType.image("x-airsar")); types.add(MediaType.application("x-rs2")); types.add(MediaType.application("x-pcidsk")); types.add(MediaType.application("pcisdk")); types.add(MediaType.image("x-pcraster")); types.add(MediaType.image("ilwis")); types.add(MediaType.image("sgi")); types.add(MediaType.application("x-srtmhgt")); types.add(MediaType.application("leveller")); types.add(MediaType.application("terragen")); types.add(MediaType.application("x-gmt")); types.add(MediaType.application("x-isis3")); types.add(MediaType.application("x-isis2")); types.add(MediaType.application("x-pds")); types.add(MediaType.application("x-til")); types.add(MediaType.application("x-ers")); types.add(MediaType.application("x-l1b")); types.add(MediaType.image("fit")); types.add(MediaType.application("x-grib")); types.add(MediaType.image("jp2")); types.add(MediaType.application("x-rmf")); types.add(MediaType.application("x-wcs")); types.add(MediaType.application("x-wms")); types.add(MediaType.application("x-msgn")); types.add(MediaType.application("x-wms")); types.add(MediaType.application("x-wms")); types.add(MediaType.application("x-rst")); types.add(MediaType.application("x-ingr")); types.add(MediaType.application("x-gsag")); types.add(MediaType.application("x-gsbg")); types.add(MediaType.application("x-gs7bg")); types.add(MediaType.application("x-cosar")); types.add(MediaType.application("x-tsx")); types.add(MediaType.application("x-coasp")); types.add(MediaType.application("x-r")); types.add(MediaType.application("x-map")); types.add(MediaType.application("x-pnm")); types.add(MediaType.application("x-doq1")); types.add(MediaType.application("x-doq2")); types.add(MediaType.application("x-envi")); types.add(MediaType.application("x-envi-hdr")); types.add(MediaType.application("x-generic-bin")); types.add(MediaType.application("x-p-aux")); types.add(MediaType.image("x-mff")); types.add(MediaType.image("x-mff2")); types.add(MediaType.image("x-fujibas")); types.add(MediaType.application("x-gsc")); types.add(MediaType.application("x-fast")); types.add(MediaType.application("x-bt")); types.add(MediaType.application("x-lan")); types.add(MediaType.application("x-cpg")); types.add(MediaType.image("ida")); types.add(MediaType.application("x-ndf")); types.add(MediaType.image("eir")); types.add(MediaType.application("x-dipex")); types.add(MediaType.application("x-lcp")); types.add(MediaType.application("x-gtx")); types.add(MediaType.application("x-los-las")); types.add(MediaType.application("x-ntv2")); types.add(MediaType.application("x-ctable2")); types.add(MediaType.application("x-ace2")); types.add(MediaType.application("x-snodas")); types.add(MediaType.application("x-kro")); types.add(MediaType.image("arg")); types.add(MediaType.application("x-rik")); types.add(MediaType.application("x-usgs-dem")); types.add(MediaType.application("x-gxf")); types.add(MediaType.application("x-dods")); types.add(MediaType.application("x-http")); types.add(MediaType.application("x-bag")); types.add(MediaType.application("x-hdf")); types.add(MediaType.image("x-hdf5-image")); types.add(MediaType.application("x-nwt-grd")); types.add(MediaType.application("x-nwt-grc")); types.add(MediaType.image("adrg")); types.add(MediaType.image("x-srp")); types.add(MediaType.application("x-blx")); types.add(MediaType.application("x-rasterlite")); types.add(MediaType.application("x-epsilon")); types.add(MediaType.application("x-sdat")); types.add(MediaType.application("x-kml")); types.add(MediaType.application("x-xyz")); types.add(MediaType.application("x-geo-pdf")); types.add(MediaType.image("x-ozi")); types.add(MediaType.application("x-ctg")); types.add(MediaType.application("x-e00-grid")); types.add(MediaType.application("x-zmap")); types.add(MediaType.application("x-webp")); types.add(MediaType.application("x-ngs-geoid")); types.add(MediaType.application("x-mbtiles")); types.add(MediaType.application("x-ppi")); types.add(MediaType.application("x-cappi")); return types; } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { if (!ExternalParser.check("gdalinfo")) { return; } // first set up and run GDAL // process the command TemporaryResources tmp = new TemporaryResources(); TikaInputStream tis = TikaInputStream.get(stream, tmp); String runCommand = processCommand(tis); String output = execCommand(new String[]{runCommand}); // now extract the actual metadata params // from the GDAL output in the content stream // to do this, we need to literally process the output // from the invoked command b/c we can't read metadata and // output text from the handler in ExternalParser // at the same time, so for now, we can't use the // ExternalParser to do this and I've had to bring some of // that functionality directly into this class // TODO: investigate a way to do both using ExternalParser extractMetFromOutput(output, metadata); applyPatternsToOutput(output, metadata, getPatterns()); // make the content handler and provide output there // now that we have metadata processOutput(handler, metadata, output); } private Map getPatterns() { Map patterns = new HashMap(); this.addPatternWithColon("Driver", patterns); this.addPatternWithColon("Files", patterns); this.addPatternWithIs("Size", patterns); this.addPatternWithIs("Coordinate System", patterns); this.addBoundingBoxPattern("Upper Left", patterns); this.addBoundingBoxPattern("Lower Left", patterns); this.addBoundingBoxPattern("Upper Right", patterns); this.addBoundingBoxPattern("Lower Right", patterns); return patterns; } private void addPatternWithColon(String name, Map patterns) { patterns.put( Pattern.compile(name + "\\:\\s*([A-Za-z0-9/ _\\-\\.]+)\\s*"), name); } private void addPatternWithIs(String name, Map patterns) { patterns.put(Pattern.compile(name + " is ([A-Za-z0-9\\.,\\s`']+)"), name); } private void addBoundingBoxPattern(String name, Map patterns) { patterns.put( Pattern.compile(name + "\\s*\\(\\s*([0-9]+\\.[0-9]+\\s*,\\s*[0-9]+\\.[0-9]+\\s*)\\)\\s*"), name); } private void extractMetFromOutput(String output, Metadata met) { Scanner scanner = new Scanner(output); String currentKey = null; String[] headings = {"Subdatasets", "Corner Coordinates"}; StringBuilder metVal = new StringBuilder(); while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.contains("=") || hasHeadings(line, headings)) { if (currentKey != null) { // time to flush this key and met val met.add(currentKey, metVal.toString()); } metVal.setLength(0); String[] lineToks = line.split("="); currentKey = lineToks[0].trim(); if (lineToks.length == 2) { metVal.append(lineToks[1]); } else { metVal.append(""); } } else { metVal.append(line); } } } private boolean hasHeadings(String line, String[] headings) { if (headings != null && headings.length > 0) { for (String heading : headings) { if (line.contains(heading)) { return true; } } return false; } else return false; } private void applyPatternsToOutput(String output, Metadata metadata, Map metadataPatterns) { Scanner scanner = new Scanner(output); while (scanner.hasNextLine()) { String line = scanner.nextLine(); for (Pattern p : metadataPatterns.keySet()) { Matcher m = p.matcher(line); if (m.find()) { if (metadataPatterns.get(p) != null && !metadataPatterns.get(p).equals("")) { metadata.add(metadataPatterns.get(p), m.group(1)); } else { metadata.add(m.group(1), m.group(2)); } } } } } private String execCommand(String[] cmd) throws IOException { // Execute Process process; String output = null; if (cmd.length == 1) { process = Runtime.getRuntime().exec(cmd[0]); } else { process = Runtime.getRuntime().exec(cmd); } try { InputStream out = process.getInputStream(); try { output = extractOutput(out); } catch (Exception e) { LOG.warn("Exception extracting output", e); output = ""; } } finally { try { process.waitFor(); } catch (InterruptedException ignore) { } } return output; } private String extractOutput(InputStream stream) throws SAXException, IOException { StringBuilder sb = new StringBuilder(); try (Reader reader = new InputStreamReader(stream, UTF_8)) { char[] buffer = new char[1024]; for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { sb.append(buffer, 0, n); } } return sb.toString(); } private void processOutput(ContentHandler handler, Metadata metadata, String output) throws SAXException, IOException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); InputStream stream = new ByteArrayInputStream(output.getBytes(UTF_8)); try (Reader reader = new InputStreamReader(stream, UTF_8)) { xhtml.startDocument(); xhtml.startElement("p"); char[] buffer = new char[1024]; for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { xhtml.characters(buffer, 0, n); } xhtml.endElement("p"); } finally { xhtml.endDocument(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy