All downloads are free. Search and download functionality uses the official Maven repository.

com.qiwenshare.ufop.util.ReadFileUtils Maven / Gradle / Ivy

There is a newer version: 1.3.4
Show newest version
package com.qiwenshare.ufop.util;

import com.qiwenshare.common.util.FileUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
//import org.apache.poi.POIXMLTextExtractor;
//import org.apache.poi.hslf.extractor.PowerPointExtractor;
//import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
//import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
//import org.apache.poi.xslf.extractor.XSLFExtractor;
//import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
//import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Utility for extracting plain text from common document formats.
 *
 * <p>Supported formats (the file type is matched case-insensitively):
 * <ol>
 *   <li>Word: {@code doc}, {@code docx}</li>
 *   <li>Excel: {@code xls}, {@code xlsx}</li>
 *   <li>PDF: {@code pdf}</li>
 *   <li>Plain text: {@code txt} (decoded as GBK)</li>
 *   <li>PowerPoint: {@code ppt}, {@code pptx}</li>
 * </ol>
 *
 * <p>The format-specific readers are best-effort: the exceptions they catch are printed
 * via {@code printStackTrace()} and an empty string is returned instead of propagating.
 *
 * @author wangshuaijun
 */
public class ReadFileUtils {

    /** Charset name used to decode plain-text ({@code txt}) content. */
    private static final String TXT_CHARSET = "GBK";

    /**
     * Returns the text content of the file at the given path, choosing the reader from
     * the file's extension.
     *
     * <p>The file stream opened here is always closed before this method returns.
     *
     * @param fileurl path of the file to read
     * @return the extracted text, or an empty string when the type is unsupported or the
     *         format-specific reader failed
     * @throws IOException if the file cannot be opened or closed, or if reading a
     *                     plain-text file fails
     */
    public static String getContentByPath(String fileurl) throws IOException {
        // try-with-resources: the original leaked this stream on several branches.
        try (FileInputStream fileInputStream = new FileInputStream(fileurl)) {
            // presumably yields the extension without the leading dot — confirm against FileUtil
            String fileType = FileUtil.getFileExtendName(fileurl);
            return getContentByInputStream(fileType, fileInputStream);
        }
    }

    /**
     * Returns the text content of a stream, choosing the reader from the given file type.
     *
     * <p>Only the PDF reader explicitly closes {@code inputStream}; for the other formats
     * this class does not close it itself, so callers should still close streams they own.
     *
     * @param fileType file type / extension such as {@code "docx"} (case-insensitive)
     * @param inputStream stream positioned at the start of the document
     * @return the extracted text, or an empty string when the type is unsupported or the
     *         format-specific reader failed
     * @throws IOException if reading plain-text content fails
     */
    public static String getContentByInputStream(String fileType, InputStream inputStream) throws IOException {
        if (isAnyOf(fileType, "doc", "docx")) {
            return readWord(fileType, inputStream);
        }
        if (isAnyOf(fileType, "xlsx", "xls")) {
            return readExcel(fileType, inputStream);
        }
        if (isAnyOf(fileType, "txt")) {
            // Previously this branch fell through and silently returned "".
            return readTxt(inputStream);
        }
        if (isAnyOf(fileType, "pdf")) {
            return readPdf(inputStream);
        }
        if (isAnyOf(fileType, "ppt", "pptx")) {
            return readPPT(fileType, inputStream);
        }
        System.out.println("不支持的文件类型!");
        return "";
    }

    /**
     * Tells whether {@code fileType} equals one of the candidates, ignoring case.
     * A {@code null} file type matches nothing.
     */
    private static boolean isAnyOf(String fileType, String... candidates) {
        for (String candidate : candidates) {
            if (candidate.equalsIgnoreCase(fileType)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the text of a PDF document.
     *
     * <p>The stream is closed by this method. An {@link IOException} raised while parsing
     * or closing is printed; in that case whatever text was extracted so far (possibly
     * the empty string) is returned.
     *
     * @param inputStream stream containing the PDF document; closed on return
     * @return the extracted text, or an empty string if extraction failed
     */
    public static String readPdf(InputStream inputStream) {
        String content = "";
        // Separate resources so that a failure closing the document cannot skip closing the stream.
        try (InputStream in = inputStream) {
            PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(in));
            pdfParser.parse();
            // The PDDocument must be closed once it is no longer needed.
            try (PDDocument pdDocument = pdfParser.getPDDocument()) {
                content = new PDFTextStripper().getText(pdDocument);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Reads the whole stream as plain text decoded with {@link #TXT_CHARSET}.
     *
     * @param inputStream stream containing the text; not closed by this method
     * @return the decoded text
     * @throws IOException if reading from the stream fails
     */
    private static String readTxt(InputStream inputStream) throws IOException {
        return IOUtils.toString(inputStream, TXT_CHARSET);
    }

    /**
     * Extracts the text of an Excel workbook.
     *
     * <p>For every sheet the output contains the sheet name followed by {@code "_"} and
     * then the string form of each non-null cell, concatenated without separators. The
     * first row of each sheet is skipped because it is treated as the header row.
     *
     * @param fileType {@code "xls"} or {@code "xlsx"} (case-insensitive)
     * @param inputStream stream containing the workbook
     * @return the extracted text, or an empty string if the type is wrong or reading failed
     */
    private static String readExcel(String fileType, InputStream inputStream) {
        boolean legacyFormat = "xls".equalsIgnoreCase(fileType);
        if (!legacyFormat && !"xlsx".equalsIgnoreCase(fileType)) {
            System.out.println("文件类型错误!");
            return "";
        }

        String content = "";
        // The workbook is closed even when extraction fails part-way.
        try (Workbook workbook = legacyFormat
                ? new HSSFWorkbook(inputStream)
                : new XSSFWorkbook(inputStream)) {
            StringBuilder text = new StringBuilder();
            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
                appendSheetText(workbook.getSheetAt(i), text);
            }
            content = text.toString();
        } catch (Exception e) {
            // Best-effort reader: any failure is reported and yields the text gathered so far ("").
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Appends the sheet name, an underscore, and the text of every non-null cell below
     * the first row of {@code sheet} to {@code out}.
     */
    private static void appendSheetText(Sheet sheet, StringBuilder out) {
        out.append(sheet.getSheetName()).append("_");
        // The first row holds the column names, so it is not read.
        int firstRowIndex = sheet.getFirstRowNum() + 1;
        int lastRowIndex = sheet.getLastRowNum();
        for (int rowIndex = firstRowIndex; rowIndex <= lastRowIndex; rowIndex++) {
            Row row = sheet.getRow(rowIndex);
            if (row == null) {
                continue;
            }
            int lastCellIndex = row.getLastCellNum();
            for (int cellIndex = row.getFirstCellNum(); cellIndex < lastCellIndex; cellIndex++) {
                Cell cell = row.getCell(cellIndex);
                if (cell != null) {
                    out.append(cell.toString());
                }
            }
        }
    }

    /**
     * Extracts the text of a Word document.
     *
     * @param fileType {@code "doc"} or {@code "docx"} (case-insensitive)
     * @param inputStream stream containing the document
     * @return the extracted text, or an empty string if the type is not a Word type or
     *         reading failed
     */
    public static String readWord(String fileType, InputStream inputStream) {
        String buffer = "";
        try {
            if ("doc".equalsIgnoreCase(fileType)) {
                try (WordExtractor extractor = new WordExtractor(inputStream)) {
                    buffer = extractor.getText();
                }
            } else if ("docx".equalsIgnoreCase(fileType)) {
                try (XWPFWordExtractor extractor = new XWPFWordExtractor(OPCPackage.open(inputStream))) {
                    buffer = extractor.getText();
                }
            } else {
                System.out.println("此文件不是word文件!");
            }
        } catch (Exception e) {
            // Best-effort reader: any failure is reported and yields the text gathered so far.
            e.printStackTrace();
        }
        return buffer;
    }

    /**
     * Extracts the text of a PowerPoint presentation.
     *
     * @param fileType {@code "ppt"} or {@code "pptx"} (case-insensitive)
     * @param inputStream stream containing the presentation
     * @return the extracted text, or an empty string if the type is not a PowerPoint type
     *         or reading failed with one of the caught exceptions
     */
    private static String readPPT(String fileType, InputStream inputStream) {
        String content = "";
        try {
            if ("ppt".equalsIgnoreCase(fileType)) {
                try (PowerPointExtractor extractor = new PowerPointExtractor(inputStream)) {
                    content = extractor.getText();
                }
            } else if ("pptx".equalsIgnoreCase(fileType)) {
                try (XSLFPowerPointExtractor extractor =
                             new XSLFPowerPointExtractor(OPCPackage.open(inputStream))) {
                    content = extractor.getText();
                }
            }
        } catch (IOException | OpenXML4JException | XmlException e) {
            // Report the failure like the other readers do; it used to be silently dropped.
            e.printStackTrace();
        }
        return content;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy