// org.apache.tika.gui.TikaGUI Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.gui;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.awt.CardLayout;
import java.awt.Color;
import java.awt.Dimension;
import java.awt.Toolkit;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowEvent;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.swing.Box;
import javax.swing.JDialog;
import javax.swing.JEditorPane;
import javax.swing.JFileChooser;
import javax.swing.JFrame;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTextPane;
import javax.swing.SwingUtilities;
import javax.swing.UIManager;
import javax.swing.event.HyperlinkEvent;
import javax.swing.event.HyperlinkEvent.EventType;
import javax.swing.event.HyperlinkListener;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.IOUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
/**
* Simple Swing GUI for Apache Tika. You can drag and drop files on top
* of the window to have them parsed.
*/
public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener {
// Default mark limit, in bytes, set on the input stream before the first parse so that
// the stream can be reset and parsed a second time for the recursive JSON view.
// Also handed to the CommonsDigester built in main().
private static final int MAX_MARK = 20 * 1024 * 1024;// 20 MB
/**
 * Serial version UID (JFrame is serializable).
 */
private static final long serialVersionUID = 5883906936187059495L;
/**
 * Parse context passed to the parser on the first (main) parse. The constructor
 * registers the image document selector and the image-saving parser in it.
 */
private final ParseContext context;
/**
 * Parser supplied to the constructor; used for both the main parse and the
 * recursive JSON parse.
 */
private final Parser parser;
/**
 * Wraps {@link #parser} and writes requested embedded images to temporary files
 * so the formatted view can display them.
 */
private final ImageSavingParser imageParser;
/**
 * The card layout for switching between different views.
 */
private final CardLayout layout = new CardLayout();
/**
 * Container for the editor cards; each view is registered under the same
 * name as its menu action command.
 */
private final JPanel cards;
/**
 * Formatted XHTML output (card "html", rendered as text/html).
 */
private final JEditorPane html;
/**
 * Plain text output (card "text").
 */
private final JEditorPane text;
/**
 * Main content output (card "main"), fed by the Boilerpipe content handler.
 */
private final JEditorPane textMain;
/**
 * Raw XHTML source (card "xhtml"), shown as plain text.
 */
private final JEditorPane xml;
/**
 * Raw JSON source (card "json") produced by the recursive parser wrapper.
 */
private final JEditorPane json;
/**
 * Document metadata (card "metadata"), one "name: value" pair per line.
 */
private final JEditorPane metadata;
/**
 * File chooser shown by the "Open..." menu item; one instance is kept for the
 * lifetime of the window.
 */
private final JFileChooser chooser = new JFileChooser();
/**
 * Builds the main window: menu bar, the welcome card and one editor card per
 * output view, and the parse context used for every document opened later.
 *
 * @param parser parser used to process all documents opened in this window
 */
public TikaGUI(Parser parser) {
    super("Apache Tika");

    // Parsing state: embedded images are selected by ImageDocumentSelector and
    // routed through the image-saving parser.
    this.parser = parser;
    this.imageParser = new ImageSavingParser(parser);
    this.context = new ParseContext();
    this.context.set(DocumentSelector.class, new ImageDocumentSelector());
    this.context.set(Parser.class, this.imageParser);

    // Window chrome.
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    addMenuBar();

    // One card per view; the card names double as menu action commands.
    this.cards = new JPanel(this.layout);
    addWelcomeCard(this.cards, "welcome");
    this.metadata = addCard(this.cards, "text/plain", "metadata");
    this.html = addCard(this.cards, "text/html", "html");
    this.text = addCard(this.cards, "text/plain", "text");
    this.textMain = addCard(this.cards, "text/plain", "main");
    this.xml = addCard(this.cards, "text/plain", "xhtml");
    this.json = addCard(this.cards, "text/plain", "json");
    add(this.cards);
    this.layout.show(this.cards, "welcome");

    setPreferredSize(new Dimension(640, 480));
    pack();
}
/**
 * Main method. Sets the Swing look and feel to the operating system
 * settings, and starts the Tika GUI with an {@link AutoDetectParser}
 * wrapped in a {@link DigestingParser} (MD5 and SHA-256 digests).
 *
 * @param args optional; when at least one argument is given, the first one
 *             is the path of a Tika configuration file to use instead of
 *             the default configuration. Further arguments are ignored.
 * @throws Exception if the configuration cannot be loaded or the look and
 *                   feel cannot be installed
 */
public static void main(String[] args) throws Exception {
    // Only build the default configuration when no config file was supplied;
    // previously it was always loaded and then thrown away.
    final TikaConfig config = args.length > 0
            ? new TikaConfig(new File(args[0]))
            : TikaConfig.getDefaultConfig();

    UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());

    // Swing components must be created on the event dispatch thread.
    SwingUtilities.invokeLater(() -> new TikaGUI(
            new DigestingParser(new AutoDetectParser(config),
                    new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5,
                            CommonsDigester.DigestAlgorithm.SHA256))).setVisible(true));
}
/**
 * Creates the File, View and Help menus and installs them as this frame's
 * menu bar. Each item's action command is handled in
 * {@link #actionPerformed(ActionEvent)}.
 */
private void addMenuBar() {
    // File menu: open a document or quit.
    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    addMenuItem(fileMenu, "Open...", "openfile", KeyEvent.VK_O);
    addMenuItem(fileMenu, "Open URL...", "openurl", KeyEvent.VK_U);
    fileMenu.addSeparator();
    addMenuItem(fileMenu, "Exit", "exit", KeyEvent.VK_X);

    // View menu: one entry per output card.
    JMenu viewMenu = new JMenu("View");
    viewMenu.setMnemonic(KeyEvent.VK_V);
    addMenuItem(viewMenu, "Metadata", "metadata", KeyEvent.VK_M);
    addMenuItem(viewMenu, "Formatted text", "html", KeyEvent.VK_F);
    addMenuItem(viewMenu, "Plain text", "text", KeyEvent.VK_P);
    addMenuItem(viewMenu, "Main content", "main", KeyEvent.VK_C);
    addMenuItem(viewMenu, "Structured text", "xhtml", KeyEvent.VK_S);
    addMenuItem(viewMenu, "Recursive JSON", "json", KeyEvent.VK_J);

    // Help menu.
    JMenu helpMenu = new JMenu("Help");
    helpMenu.setMnemonic(KeyEvent.VK_H);
    addMenuItem(helpMenu, "About Tika", "about", KeyEvent.VK_A);

    JMenuBar menuBar = new JMenuBar();
    menuBar.add(fileMenu);
    menuBar.add(viewMenu);
    // Glue pushes the Help menu to the far right of the bar.
    menuBar.add(Box.createHorizontalGlue());
    menuBar.add(helpMenu);
    setJMenuBar(menuBar);
}
/**
 * Appends a menu item to the given menu and routes its action events to
 * this frame.
 *
 * @param menu    menu that receives the new item
 * @param title   text shown for the item
 * @param command action command reported when the item is chosen
 * @param key     keyboard mnemonic ({@code KeyEvent.VK_*})
 */
private void addMenuItem(JMenu menu, String title, String command, int key) {
    JMenuItem entry = menu.add(new JMenuItem(title, key));
    entry.setActionCommand(command);
    entry.addActionListener(this);
}
/**
 * Dispatches menu actions by their action command: opening a file or URL,
 * switching the visible card, showing the about dialog, or closing the
 * window. Unknown (or missing) commands are ignored.
 *
 * @param e the action event fired by a menu item
 */
public void actionPerformed(ActionEvent e) {
    String command = e.getActionCommand();
    if (command == null) {
        // No command matches null; a switch on null would throw instead.
        return;
    }
    switch (command) {
        case "openfile":
            if (chooser.showOpenDialog(this) == JFileChooser.APPROVE_OPTION) {
                openFile(chooser.getSelectedFile());
            }
            break;
        case "openurl": {
            Object answer = JOptionPane
                    .showInputDialog(this, "Enter the URL of the resource to be parsed:",
                            "Open URL", JOptionPane.PLAIN_MESSAGE, null, null, "");
            if (answer == null) {
                break;
            }
            String typed = answer.toString();
            if (typed.isEmpty()) {
                break;
            }
            try {
                openURL(new URL(typed.trim()));
            } catch (MalformedURLException invalid) {
                JOptionPane.showMessageDialog(this, "The given string is not a valid URL",
                        "Invalid URL", JOptionPane.ERROR_MESSAGE);
            }
            break;
        }
        case "html":
        case "text":
        case "main":
        case "xhtml":
        case "metadata":
        case "json":
            // View commands share their name with the card they display.
            layout.show(cards, command);
            break;
        case "about":
            textDialog("About Apache Tika", TikaGUI.class.getResource("about.html"));
            break;
        case "exit":
            // Post a closing event so the frame's default close operation applies.
            Toolkit.getDefaultToolkit().getSystemEventQueue()
                    .postEvent(new WindowEvent(this, WindowEvent.WINDOW_CLOSING));
            break;
        default:
            break;
    }
}
/**
 * Parses the given file and shows the results in the view cards. Any
 * failure is reported in an error dialog rather than propagated.
 *
 * @param file file to parse
 */
public void openFile(File file) {
    Metadata fileMetadata = new Metadata();
    try (TikaInputStream in = TikaInputStream.get(file, fileMetadata)) {
        handleStream(in, fileMetadata);
    } catch (Throwable failure) {
        // Deliberately broad: the GUI should survive any parser failure.
        handleError(file.getPath(), failure);
    }
}
/**
 * Fetches and parses the resource at the given URL and shows the results in
 * the view cards. Any failure is reported in an error dialog rather than
 * propagated.
 *
 * @param url location of the resource to parse
 */
public void openURL(URL url) {
    Metadata urlMetadata = new Metadata();
    try (TikaInputStream in = TikaInputStream.get(url, urlMetadata)) {
        handleStream(in, urlMetadata);
    } catch (Throwable failure) {
        // Deliberately broad: the GUI should survive any parser failure.
        handleError(url.toString(), failure);
    }
}
/**
 * Parses the stream and fills every view card.
 *
 * The stream is parsed once with the configured parser, feeding the
 * formatted, plain text, main content and XHTML views through a single tee
 * handler, and the collected metadata is listed sorted by name. The stream
 * is then reset and parsed a second time with a
 * {@link RecursiveParserWrapper} to fill the JSON view. The metadata card
 * is shown when done.
 *
 * @param input stream to parse; wrapped in a {@link TikaInputStream} if it
 *              is not one already. The caller remains responsible for
 *              closing the stream it passed in.
 * @param md    metadata object handed to the parser and read back afterwards
 * @throws Exception if either parse or the output serialization fails
 */
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();
    ContentHandler handler =
            new TeeContentHandler(getHtmlHandler(htmlBuffer), getTextContentHandler(textBuffer),
                    getTextMainContentHandler(textMainBuffer), getXmlContentHandler(xmlBuffer));
    context.set(DocumentSelector.class, new ImageDocumentSelector());

    input = TikaInputStream.get(input);
    if (input.markSupported()) {
        // Mark the start so the stream can be rewound for the JSON pass.
        int mark = MAX_MARK;
        if (input instanceof TikaInputStream && ((TikaInputStream) input).hasFile()) {
            long length = ((TikaInputStream) input).getLength();
            if (length >= 0) {
                // Clamp instead of casting: a plain (int) cast overflows for
                // files larger than 2 GB and produces a bogus mark limit.
                mark = (int) Math.min(length, Integer.MAX_VALUE);
            }
        }
        input.mark(mark);
    }

    parser.parse(input, handler, md, context);

    // One "name: value" line per metadata value, sorted by name.
    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }

    String name = md.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }

    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());

    if (!input.markSupported()) {
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }

    boolean isReset = false;
    try {
        input.reset();
        isReset = true;
    } catch (IOException e) {
        // Most likely more than the marked number of bytes was read.
        setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK +
                " bytes for this type of processing in the GUI.\n" +
                "Try the app with command line argument of -J.");
    }
    if (isReset) {
        // Second pass: collect metadata and body text of the document and
        // its embedded documents as a JSON list.
        RecursiveParserWrapperHandler recursiveParserWrapperHandler =
                new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(
                        BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1), -1);
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
        wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext());
        StringWriter jsonBuffer = new StringWriter();
        JsonMetadataList.setPrettyPrinting(true);
        JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), jsonBuffer);
        setText(json, jsonBuffer.toString());
    }
    layout.show(cards, "metadata");
}
/**
 * Shows a dialog explaining that the named document could not be parsed,
 * followed by the full stack trace of the failure.
 *
 * @param name path or URL of the document that failed
 * @param t    the failure to report
 */
private void handleError(String name, Throwable t) {
    StringWriter report = new StringWriter();
    report.write("Apache Tika was unable to parse the document\n"
            + "at " + name + ".\n\n"
            + "The full exception stack trace is included below:\n\n");
    t.printStackTrace(new PrintWriter(report));

    JEditorPane tracePane = new JEditorPane("text/plain", report.toString());
    tracePane.setEditable(false);
    tracePane.setBackground(Color.WHITE);
    // Scroll back to the top so the explanation is visible first.
    tracePane.setCaretPosition(0);
    tracePane.setPreferredSize(new Dimension(600, 400));

    JDialog errorDialog = new JDialog(this, "Apache Tika error");
    errorDialog.add(new JScrollPane(tracePane));
    errorDialog.pack();
    errorDialog.setVisible(true);
}
/**
 * Adds the read-only welcome page, loaded from the {@code welcome.html}
 * class resource, as a card. If the page cannot be loaded the stack trace
 * is printed and no card is added.
 *
 * @param panel card container
 * @param name  card name
 */
private void addWelcomeCard(JPanel panel, String name) {
    final JEditorPane welcome;
    try {
        welcome = new JEditorPane(TikaGUI.class.getResource("welcome.html"));
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    welcome.setContentType("text/html");
    welcome.setEditable(false);
    welcome.setBackground(Color.WHITE);
    // Wrap the default handler so files dropped on the page get parsed.
    welcome.setTransferHandler(
            new ParsingTransferHandler(welcome.getTransferHandler(), this));
    panel.add(new JScrollPane(welcome), name);
}
/**
 * Creates an output pane for one view, wraps it in a scroll pane and
 * registers it as a card.
 *
 * @param panel card container
 * @param type  content type of the pane, e.g. {@code text/plain}
 * @param name  card name
 * @return the newly created pane
 */
private JEditorPane addCard(JPanel panel, String type, String name) {
    JEditorPane pane = new JTextPane();
    pane.setBackground(Color.WHITE);
    pane.setContentType(type);
    // Wrap the default handler so files dropped on the pane get parsed.
    ParsingTransferHandler dropHandler =
            new ParsingTransferHandler(pane.getTransferHandler(), this);
    pane.setTransferHandler(dropHandler);
    JScrollPane scroller = new JScrollPane(pane);
    panel.add(scroller, name);
    return pane;
}
/**
 * Shows a read-only HTML page in a dialog. Hyperlinks in the page are
 * reported to this frame. If the page cannot be loaded the stack trace is
 * printed and no dialog is shown.
 *
 * @param title    dialog title
 * @param resource location of the HTML page
 */
private void textDialog(String title, URL resource) {
    JDialog window = new JDialog(this, title);
    final JEditorPane page;
    try {
        page = new JEditorPane(resource);
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    page.setContentType("text/html");
    page.setEditable(false);
    page.setBackground(Color.WHITE);
    page.setPreferredSize(new Dimension(400, 250));
    page.addHyperlinkListener(this);
    window.add(page);
    window.pack();
    window.setVisible(true);
}
/**
 * Opens the target of an activated hyperlink as plain text (decoded as
 * UTF-8) in a new dialog titled with the last path segment of the URL.
 * Events other than activation, and links without a usable URL, are
 * ignored.
 *
 * @param e the hyperlink event
 */
@Override
public void hyperlinkUpdate(HyperlinkEvent e) {
    if (e.getEventType() != EventType.ACTIVATED) {
        return;
    }
    URL url = e.getURL();
    if (url == null) {
        // getURL() returns null for malformed or unresolved links; without
        // this guard the openStream() call below would throw an NPE.
        System.err.println("Ignoring hyperlink without a valid URL: " + e.getDescription());
        return;
    }
    try (InputStream stream = url.openStream()) {
        JEditorPane editor =
                new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8));
        editor.setEditable(false);
        editor.setBackground(Color.WHITE);
        editor.setCaretPosition(0);
        editor.setPreferredSize(new Dimension(600, 400));

        // Use the part after the last '/' as a short document name.
        String name = url.toString();
        name = name.substring(name.lastIndexOf('/') + 1);

        JDialog dialog = new JDialog(this, "Apache Tika: " + name);
        dialog.add(new JScrollPane(editor));
        dialog.pack();
        dialog.setVisible(true);
    } catch (IOException exception) {
        exception.printStackTrace();
    }
}
/**
 * Replaces the content of the given pane and moves the caret back to the
 * start of the document.
 *
 * @param editor pane to update
 * @param text   new content
 */
private void setText(JEditorPane editor, String text) {
    final int documentStart = 0;
    editor.setText(text);
    editor.setCaretPosition(documentStart);
}
/**
 * Creates and returns a content handler that turns XHTML input to
 * simplified HTML output that can be correctly parsed and displayed
 * by {@link JEditorPane}.
 * <p>
 * The returned content handler is set to output {@code html}
 * to the given writer. The XHTML namespace is removed from the output
 * to prevent the serializer from using the {@code <tag/>} empty element
 * syntax that causes extra ">" characters to be displayed.
 * The {@code <head>} tags are dropped to prevent the serializer from
 * generating a {@code <META>} content type tag that makes
 * {@link JEditorPane} fail thinking that the document character set
 * is inconsistent.
 * <p>
 * Additionally, it will use ImageSavingParser to re-write embedded:(image)
 * image links to be file:///(temporary file) so that they can be loaded.
 * The rewrite is done on a private copy of the attributes, so other
 * handlers receiving the same SAX event still see the original link.
 *
 * @param writer output writer
 * @return HTML content handler
 * @throws TransformerConfigurationException if an error occurs
 */
private ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException {
    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
    handler.setResult(new StreamResult(writer));
    return new ContentHandlerDecorator(handler) {
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts)
                throws SAXException {
            if ("head".equals(localName)) {
                return;
            }
            if (XHTMLContentHandler.XHTML.equals(uri)) {
                uri = null;
            }
            if ("img".equals(localName)) {
                super.startElement(uri, localName, name, withLocalImageSources(atts));
            } else {
                super.startElement(uri, localName, name, atts);
            }
        }

        /**
         * Returns a copy of the attributes in which every embedded: src
         * value points at a temporary file requested from the image parser.
         */
        private AttributesImpl withLocalImageSources(Attributes atts) {
            // Always copy: the incoming object is shared with the other
            // handlers of the tee and must not be modified in place.
            AttributesImpl newAttrs = new AttributesImpl(atts);
            for (int i = 0; i < newAttrs.getLength(); i++) {
                if (!"src".equals(newAttrs.getLocalName(i))) {
                    continue;
                }
                String src = newAttrs.getValue(i);
                if (src == null || !src.startsWith("embedded:")) {
                    continue;
                }
                String filename = src.substring(src.indexOf(':') + 1);
                try {
                    File img = imageParser.requestSave(filename);
                    String newSrc = img.toURI().toString();
                    newAttrs.setValue(i, newSrc);
                } catch (IOException e) {
                    System.err.println(
                            "Error creating temp image file " + filename);
                    // The html viewer will show a broken image too to alert them
                }
            }
            return newAttrs;
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if (XHTMLContentHandler.XHTML.equals(uri)) {
                uri = null;
            }
            if (!"head".equals(localName)) {
                super.endElement(uri, localName, name);
            }
        }

        @Override
        public void startPrefixMapping(String prefix, String uri) {
            // Namespace mappings are intentionally not forwarded.
        }

        @Override
        public void endPrefixMapping(String prefix) {
            // Namespace mappings are intentionally not forwarded.
        }
    };
}
/**
 * Creates the handler behind the plain text view.
 *
 * @param writer output writer
 * @return a {@link BodyContentHandler} writing to the given writer
 */
private ContentHandler getTextContentHandler(Writer writer) {
    ContentHandler bodyText = new BodyContentHandler(writer);
    return bodyText;
}
/**
 * Creates the handler behind the main content view.
 *
 * @param writer output writer
 * @return a {@link BoilerpipeContentHandler} writing to the given writer
 */
private ContentHandler getTextMainContentHandler(Writer writer) {
    ContentHandler mainContent = new BoilerpipeContentHandler(writer);
    return mainContent;
}
/**
 * Creates the handler behind the structured text view: a serializer that
 * writes the incoming SAX events as XML to the given writer.
 *
 * @param writer output writer
 * @return XML serializing content handler
 * @throws TransformerConfigurationException if the serializer cannot be created
 */
private ContentHandler getXmlContentHandler(Writer writer)
        throws TransformerConfigurationException {
    TransformerHandler serializer =
            ((SAXTransformerFactory) SAXTransformerFactory.newInstance())
                    .newTransformerHandler();
    // The output method is configured before the result is attached,
    // exactly as in the HTML handler.
    serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    StreamResult target = new StreamResult(writer);
    serializer.setResult(target);
    return serializer;
}
/**
 * A {@link DocumentSelector} that accepts only embedded documents whose
 * content type metadata starts with {@code image/}.
 */
private static class ImageDocumentSelector implements DocumentSelector {
    public boolean select(Metadata metadata) {
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        if (contentType == null) {
            // No declared type: not known to be an image.
            return false;
        }
        return contentType.startsWith("image/");
    }
}
/**
* A recursive parser that saves certain images into the temporary
* directory, and delegates everything else to another downstream
* parser.
*/
private static class ImageSavingParser extends AbstractParser {
private Map wanted = new HashMap<>();
private Parser downstreamParser;
private File tmpDir;
private ImageSavingParser(Parser downstreamParser) {
this.downstreamParser = downstreamParser;
try {
File t = File.createTempFile("tika", ".test");
tmpDir = t.getParentFile();
} catch (IOException e) {
//swallow
}
}
public File requestSave(String embeddedName) throws IOException {
String suffix = ".tika";
int splitAt = embeddedName.lastIndexOf('.');
if (splitAt > 0) {
embeddedName.substring(splitAt);
}
File tmp = File.createTempFile("tika-embedded-", suffix);
wanted.put(embeddedName, tmp);
return tmp;
}
public Set getSupportedTypes(ParseContext context) {
return downstreamParser.getSupportedTypes(context);
}
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
if (name != null && wanted.containsKey(name)) {
FileOutputStream out = new FileOutputStream(wanted.get(name));
IOUtils.copy(stream, out);
out.close();
} else {
if (downstreamParser != null) {
downstreamParser.parse(stream, handler, metadata, context);
}
}
}
}
}