// cn.tyoui.index.FileContent Maven / Gradle / Ivy
package cn.tyoui.index;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
/**
 * Extracts the text content of a file using Apache Tika and strips it of
 * ASCII letters, digits, whitespace and a fixed list of punctuation marks.
 *
 * @author Tyoui
 */
public class FileContent {

    private static final Logger LOGGER = Logger.getLogger(FileContent.class.getName());

    /** Write limit handed to {@link BodyContentHandler}: 100 * 1024 * 1024. */
    private static final int MAX_CONTENT_LENGTH = 1024 * 1024 * 100;

    /**
     * Lower-case file extensions that {@link #init(File)} will parse.
     * {@code doc} and {@code xls} are listed explicitly because the previous
     * substring-based check accepted them as well.
     */
    private static final Set<String> SUPPORTED_SUFFIXES =
            new HashSet<>(Arrays.asList("txt", "doc", "docx", "pdf", "xls", "xlsx"));

    /**
     * Characters removed from the extracted text: the listed punctuation,
     * tab / newline / carriage return / space, ASCII digits and ASCII letters.
     * Compiled once instead of on every call.
     */
    private static final Pattern STRIPPED_CHARACTERS = Pattern.compile(
            "[<>{}|/%$#@~·`()??_&()+【】\t\n\r0-9A-Za-z,.!;,。;!“”::、 -]");

    /**
     * Reads the text content of a file with the given parser.
     *
     * @param filePath the file to read
     * @param parser   the Tika parser used to interpret the file
     * @return the text collected by the body content handler
     * @throws Exception if the file cannot be opened or the parser fails
     */
    private static String wordExport(File filePath, Parser parser) throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, filePath.getName());

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        ContentHandler contentHandler = new BodyContentHandler(MAX_CONTENT_LENGTH);

        // try-with-resources closes the stream even when parse() throws.
        try (InputStream inputStream = new FileInputStream(filePath)) {
            parser.parse(inputStream, contentHandler, metadata, parseContext);
        }
        return contentHandler.toString();
    }

    /**
     * Returns the lower-cased extension of a file name.
     *
     * @param name the file name
     * @return the text after the last {@code '.'}, lower-cased, or an empty
     *         string when the name contains no dot
     */
    private static String extensionOf(String name) {
        int dotIndex = name.lastIndexOf('.');
        if (dotIndex < 0) {
            return "";
        }
        return name.substring(dotIndex + 1).toLowerCase(Locale.ROOT);
    }

    /**
     * Extracts and filters the text of a supported file.
     *
     * <p>Only files whose extension (compared case-insensitively) is one of
     * {@code txt}, {@code doc}, {@code docx}, {@code pdf}, {@code xls} or
     * {@code xlsx} are parsed. {@code txt} files use {@link TXTParser}; all
     * others use {@link AutoDetectParser}. The extracted text then has every
     * character matched by {@link #STRIPPED_CHARACTERS} removed.
     *
     * @param file the file to read; may be {@code null}
     * @return the filtered text, or an empty string when {@code file} is
     *         {@code null}, its extension is not supported, or reading or
     *         parsing fails (the failure is logged at WARNING level)
     */
    public static String init(File file) {
        if (file == null) {
            return "";
        }

        String suffix = extensionOf(file.getName());
        if (!SUPPORTED_SUFFIXES.contains(suffix)) {
            return "";
        }

        Parser parser = "txt".equals(suffix) ? new TXTParser() : new AutoDetectParser();
        try {
            String content = wordExport(file, parser);
            return STRIPPED_CHARACTERS.matcher(content).replaceAll("");
        } catch (Exception e) {
            // Best-effort contract: callers get an empty string on failure,
            // but the cause is recorded instead of being silently dropped.
            LOGGER.log(Level.WARNING, "Failed to extract text from " + file, e);
            return "";
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy