All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.gdal.GDALParser Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.gdal;

//JDK imports
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN;

//Tika imports
//SAX imports

/**
 * Wraps execution of the Geospatial Data Abstraction
 * Library (GDAL) gdalinfo tool used to extract geospatial
 * information out of hundreds of geo file formats.
 * 

* The parser requires the installation of GDAL and for gdalinfo to * be located on the path. *

* Basic information (Size, Coordinate System, Bounding Box, Driver, and * resource info) are extracted as metadata, and the remaining metadata patterns * are extracted and added. *

* The output of the command is available from the provided * {@link ContentHandler} in the * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} method. */ public class GDALParser extends AbstractParser { private static final long serialVersionUID = -3869130527323941401L; private static final Logger LOG = LoggerFactory.getLogger(GDALParser.class); private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>( Arrays.asList( MediaType.application("x-netcdf"), MediaType.application("vrt"), MediaType.image("geotiff"), MediaType.image("nitf"), MediaType.application("x-rpf-toc"), MediaType.application("x-ecrg-toc"), MediaType.image("hfa"), MediaType.image("sar-ceos"), MediaType.image("ceos"), MediaType.application("jaxa-pal-sar"), MediaType.application("gff"), MediaType.application("elas"), MediaType.application("aig"), MediaType.application("aaigrid"), MediaType.application("grass-ascii-grid"), MediaType.application("sdts-raster"), MediaType.application("dted"), MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("raster"), MediaType.application("jdem"), MediaType.image("gif"), MediaType.image("big-gif"), MediaType.image("envisat"), MediaType.image("fits"), MediaType.application("fits"), MediaType.image("bsb"), MediaType.application("xpm"), MediaType.image("bmp"), MediaType.image("x-dimap"), MediaType.image("x-airsar"), MediaType.application("x-rs2"), MediaType.application("x-pcidsk"), MediaType.application("pcisdk"), MediaType.image("x-pcraster"), MediaType.image("ilwis"), MediaType.image("sgi"), MediaType.application("x-srtmhgt"), MediaType.application("leveller"), MediaType.application("terragen"), MediaType.application("x-gmt"), MediaType.application("x-isis3"), MediaType.application("x-isis2"), MediaType.application("x-pds"), MediaType.application("x-til"), MediaType.application("x-ers"), MediaType.application("x-l1b"), MediaType.image("fit"), MediaType.application("x-grib"), MediaType.image("jp2"), MediaType.application("x-rmf"), MediaType.application("x-wcs"), MediaType.application("x-wms"), MediaType.application("x-msgn"), MediaType.application("x-wms"), MediaType.application("x-wms"), MediaType.application("x-rst"), MediaType.application("x-ingr"), MediaType.application("x-gsag"), MediaType.application("x-gsbg"), MediaType.application("x-gs7bg"), MediaType.application("x-cosar"), MediaType.application("x-tsx"), MediaType.application("x-coasp"), MediaType.application("x-r"), MediaType.application("x-map"), MediaType.application("x-pnm"), MediaType.application("x-doq1"), MediaType.application("x-doq2"), MediaType.application("x-envi"), MediaType.application("x-envi-hdr"), MediaType.application("x-generic-bin"), MediaType.application("x-p-aux"), MediaType.image("x-mff"), MediaType.image("x-mff2"), MediaType.image("x-fujibas"), MediaType.application("x-gsc"), MediaType.application("x-fast"), MediaType.application("x-bt"), MediaType.application("x-lan"), MediaType.application("x-cpg"), MediaType.image("ida"), MediaType.application("x-ndf"), MediaType.image("eir"), MediaType.application("x-dipex"), MediaType.application("x-lcp"), MediaType.application("x-gtx"), MediaType.application("x-los-las"), MediaType.application("x-ntv2"), MediaType.application("x-ctable2"), MediaType.application("x-ace2"), MediaType.application("x-snodas"), MediaType.application("x-kro"), MediaType.image("arg"), MediaType.application("x-rik"), MediaType.application("x-usgs-dem"), MediaType.application("x-gxf"), MediaType.application("x-dods"), MediaType.application("x-http"), MediaType.application("x-bag"), MediaType.application("x-hdf"), MediaType.image("x-hdf5-image"), MediaType.application("x-nwt-grd"), MediaType.application("x-nwt-grc"), MediaType.image("adrg"), MediaType.image("x-srp"), MediaType.application("x-blx"), MediaType.application("x-rasterlite"), MediaType.application("x-epsilon"), MediaType.application("x-sdat"), MediaType.application("x-kml"), MediaType.application("x-xyz"), MediaType.application("x-geo-pdf"), MediaType.image("x-ozi"), MediaType.application("x-ctg"), MediaType.application("x-e00-grid"), MediaType.application("x-zmap"), MediaType.application("x-webp"), MediaType.application("x-ngs-geoid"), MediaType.application("x-mbtiles"), MediaType.application("x-ppi"), MediaType.application("x-cappi")))); private String command; public GDALParser() { setCommand("gdalinfo ${INPUT}"); } public void setCommand(String command) { this.command = command; } public String getCommand() { return this.command; } public String processCommand(InputStream stream) { TikaInputStream tis = (TikaInputStream) stream; String pCommand = this.command; try { if (this.command.contains(INPUT_FILE_TOKEN)) { pCommand = this.command.replace(INPUT_FILE_TOKEN, tis.getFile() .getPath()); } } catch (Exception e) { LOG.warn("exception processing command", e); } return pCommand; } @Override public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { if (!ExternalParser.check("gdalinfo")) { return; } // first set up and run GDAL // process the command TemporaryResources tmp = new TemporaryResources(); TikaInputStream tis = TikaInputStream.get(stream, tmp); String runCommand = processCommand(tis); String output = execCommand(new String[]{runCommand}); // now extract the actual metadata params // from the GDAL output in the content stream // to do this, we need to literally process the output // from the invoked command b/c we can't read metadata and // output text from the handler in ExternalParser // at the same time, so for now, we can't use the // ExternalParser to do this and I've had to bring some of // that functionality directly into this class // TODO: investigate a way to do both using ExternalParser extractMetFromOutput(output, metadata); applyPatternsToOutput(output, metadata, getPatterns()); // make the content handler and provide output there // now that we have metadata processOutput(handler, metadata, output); } private Map getPatterns() { Map patterns = new HashMap(); this.addPatternWithColon("Driver", patterns); this.addPatternWithColon("Files", patterns); this.addPatternWithIs("Size", patterns); this.addPatternWithIs("Coordinate System", patterns); this.addBoundingBoxPattern("Upper Left", patterns); this.addBoundingBoxPattern("Lower Left", patterns); this.addBoundingBoxPattern("Upper Right", patterns); this.addBoundingBoxPattern("Lower Right", patterns); return patterns; } private void addPatternWithColon(String name, Map patterns) { patterns.put( Pattern.compile(name + "\\:\\s*([A-Za-z0-9/ _\\-\\.]+)\\s*"), name); } private void addPatternWithIs(String name, Map patterns) { patterns.put(Pattern.compile(name + " is ([A-Za-z0-9\\.,\\s`']+)"), name); } private void addBoundingBoxPattern(String name, Map patterns) { patterns.put( Pattern.compile(name + "\\s*\\(\\s*([0-9]+\\.[0-9]+\\s*,\\s*[0-9]+\\.[0-9]+\\s*)\\)\\s*"), name); } private void extractMetFromOutput(String output, Metadata met) { Scanner scanner = new Scanner(output); String currentKey = null; String[] headings = {"Subdatasets", "Corner Coordinates"}; StringBuilder metVal = new StringBuilder(); while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.contains("=") || hasHeadings(line, headings)) { if (currentKey != null) { // time to flush this key and met val met.add(currentKey, metVal.toString()); } metVal.setLength(0); String[] lineToks = line.split("="); currentKey = lineToks[0].trim(); if (lineToks.length == 2) { metVal.append(lineToks[1]); } else { metVal.append(""); } } else { metVal.append(line); } } } private boolean hasHeadings(String line, String[] headings) { if (headings != null && headings.length > 0) { for (String heading : headings) { if (line.contains(heading)) { return true; } } return false; } else return false; } private void applyPatternsToOutput(String output, Metadata metadata, Map metadataPatterns) { Scanner scanner = new Scanner(output); while (scanner.hasNextLine()) { String line = scanner.nextLine(); for (Pattern p : metadataPatterns.keySet()) { Matcher m = p.matcher(line); if (m.find()) { if (metadataPatterns.get(p) != null && !metadataPatterns.get(p).equals("")) { metadata.add(metadataPatterns.get(p), m.group(1)); } else { metadata.add(m.group(1), m.group(2)); } } } } } private String execCommand(String[] cmd) throws IOException { // Execute Process process; String output = null; if (cmd.length == 1) { process = Runtime.getRuntime().exec(cmd[0]); } else { process = Runtime.getRuntime().exec(cmd); } try { InputStream out = process.getInputStream(); try { output = extractOutput(out); } catch (Exception e) { LOG.warn("Exception extracting output", e); output = ""; } } finally { try { process.waitFor(); } catch (InterruptedException ignore) { } } return output; } private String extractOutput(InputStream stream) throws SAXException, IOException { StringBuilder sb = new StringBuilder(); try (Reader reader = new InputStreamReader(stream, UTF_8)) { char[] buffer = new char[1024]; for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { sb.append(buffer, 0, n); } } return sb.toString(); } private void processOutput(ContentHandler handler, Metadata metadata, String output) throws SAXException, IOException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); InputStream stream = new ByteArrayInputStream(output.getBytes(UTF_8)); try (Reader reader = new InputStreamReader(stream, UTF_8)) { xhtml.startDocument(); xhtml.startElement("p"); char[] buffer = new char[1024]; for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { xhtml.characters(buffer, 0, n); } xhtml.endElement("p"); } finally { xhtml.endDocument(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy