org.apache.tika.eval.db.MimeBuffer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval.db;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Types;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
public class MimeBuffer extends AbstractDBBuffer {
private final PreparedStatement st;
private final TikaConfig config;
private final Connection connection;
public MimeBuffer(Connection connection, TikaConfig config) throws SQLException {
st = connection.prepareStatement("insert into " + AbstractProfiler.MIME_TABLE.getName() + "( " +
Cols.MIME_ID.name() + ", " +
Cols.MIME_STRING.name() + ", " +
Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
this.config = config;
this.connection = connection;
}
@Override
public void write(int id, String value) throws RuntimeException {
try {
st.clearParameters();
st.setInt(1, id);
st.setString(2, value);
try {
String ext = MimeUtil.getExtension(value, config);
if (ext == null || ext.length() == 0) {
st.setNull(3, Types.VARCHAR);
} else {
st.setString(3, ext);
}
} catch (MimeTypeException e) {
st.setNull(3, Types.VARCHAR);
}
st.execute();
} catch (SQLException e) {
throw new RuntimeException(e);
}
}
@Override
public void close() throws SQLException {
st.close();
connection.commit();
connection.close();
}
private static class MimeUtil {
//TODO: see if MimeType now works for these
private static final String APPLICATION = "application";
private static final String TEXT = "text";
private static final String HTML = "html";
private static final String XML = "xml";
private static final String XHTML_XML = "xhtml+xml";
private static final String CSS = "css";
private static final String CSV = "csv";
private static final String PLAIN = "plain";
private static final String EMPTY_STRING = "";
/**
* Utility method to convert from a string value representing a content type
* (e.g. "application/pdf") into the most common extension for that file type
* (e.g. "pdf").
*
* This will has special handling for texty filetypes whose MimeTypes
* don't currently return anything for {@link MimeType#getExtension};
*
* @param contentType string representing a content type, for example: "application/pdf"
* @param config config from which to get MimeRepository
* @return extension or empty string
* @throws MimeTypeException thrown if MimeTypes can't parse the contentType
*/
public static String getExtension(String contentType, TikaConfig config)
throws MimeTypeException {
MimeTypes types = config.getMimeRepository();
MimeType mime = types.forName(contentType);
return getExtension(mime);
}
public static String getExtension(MimeType mime) {
String ext = mime.getExtension();
if (ext.startsWith(".")) {
ext = ext.substring(1);
}
//special handling for text/html/xml
if (ext.length() == 0) {
ext = tryTextyTypes(mime.getType());
}
return ext;
}
private static String tryTextyTypes(MediaType mediaType) {
String type = mediaType.getType();
String subtype = mediaType.getSubtype();
if (type.equals(TEXT)) {
if (subtype.equals(HTML)) {
return HTML;
} else if (subtype.equals(PLAIN)) {
return "txt";
} else if (subtype.equals(CSS)) {
return CSS;
} else if (subtype.equals(CSV)) {
return CSV;
}
} else if (type.equals(APPLICATION)) {
if (subtype.equals(XML)) {
return XML;
} else if (subtype.equals(XHTML_XML)) {
return "html";
}
}
return EMPTY_STRING;
}
}
}