All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.undercouch.citeproc.BibliographyFileReader Maven / Gradle / Ivy

package de.undercouch.citeproc;

import de.undercouch.citeproc.bibtex.BibTeXConverter;
import de.undercouch.citeproc.bibtex.BibTeXItemDataProvider;
import de.undercouch.citeproc.csl.CSLItemData;
import de.undercouch.citeproc.endnote.EndNoteConverter;
import de.undercouch.citeproc.endnote.EndNoteItemDataProvider;
import de.undercouch.citeproc.endnote.EndNoteLibrary;
import de.undercouch.citeproc.helper.json.JsonLexer;
import de.undercouch.citeproc.helper.json.JsonParser;
import de.undercouch.citeproc.ris.RISConverter;
import de.undercouch.citeproc.ris.RISItemDataProvider;
import de.undercouch.citeproc.ris.RISLibrary;
import org.jbibtex.BibTeXDatabase;
import org.jbibtex.ParseException;
import org.yaml.snakeyaml.Yaml;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * A convenience class providing methods to load any supported kind of
 * bibliography files. The class automatically detects the correct file
 * format and returns a {@link ItemDataProvider} that holds all
 * bibliography items read from the file.
 * @author Michel Kraemer
 */
public class BibliographyFileReader {
    /**
     * Supported file formats for bibliography files
     */
    public enum FileFormat {
        /**
         * A BibTeX file
         */
        BIBTEX,

        /**
         * A CSL JSON object
         */
        JSON_OBJECT,

        /**
         * An array of CSL JSON objects
         */
        JSON_ARRAY,

        /**
         * A YAML document
         * @since 1.1.0
         */
        YAML,

        /**
         * An EndNote file
         */
        ENDNOTE,

        /**
         * An RIS file
         */
        RIS,

        /**
         * Unknown file format
         */
        UNKNOWN
    }

    /**
     * Reads all items from an input bibliography file and returns a provider
     * serving these items
     * @param bibfile the input file
     * @return the provider
     * @throws FileNotFoundException if the input file was not found
     * @throws IOException if the input file could not be read
     */
    public ItemDataProvider readBibliographyFile(File bibfile)
            throws FileNotFoundException, IOException {
        // open buffered input stream to bibliography file
        if (!bibfile.exists()) {
            throw new FileNotFoundException("Bibliography file `" +
                    bibfile.getName() + "' does not exist");
        }
        try (BufferedInputStream bis = new BufferedInputStream(
                new FileInputStream(bibfile))) {
            return readBibliographyFile(bis, bibfile.getName());
        }
    }

    /**
     * Reads all items from an input stream and returns a provider
     * serving these items. Note that you can supply an additional file
     * name to help the method to determine the exact bibliography file format.
     * If you don't know the file name you can pass null, but in this case the
     * method's result might try to read the input stream using the wrong
     * file format (depending on the input stream's contents). Also note
     * that the caller is responsible for closing the given input stream.
     * @param bibstream the input stream
     * @param filename the name of the input file (can be null if you don't
     * know the name)
     * @return the provider
     * @throws IOException if the input stream could not be read
     */
    public ItemDataProvider readBibliographyFile(InputStream bibstream,
            String filename) throws IOException {
        BufferedInputStream bis;
        if (bibstream instanceof BufferedInputStream) {
            bis = (BufferedInputStream)bibstream;
        } else {
            bis = new BufferedInputStream(bibstream);
        }

        // determine file format
        FileFormat ff = determineFileFormat(bis, filename);

        // read stream
        return readBibliographyFile(bis, ff);
    }

    /**
     * Reads all items from an input stream using the given file format and
     * returns a provider serving these items.
     * @param bibstream the input stream
     * @param format the bibliography file format
     * @return the provider
     * @throws IOException if the input stream could not be read
     */
    public ItemDataProvider readBibliographyFile(InputStream bibstream,
            FileFormat format) throws IOException {
        ItemDataProvider provider;
        try {
            // load bibliography file
            if (format == FileFormat.BIBTEX) {
                BibTeXDatabase db = new BibTeXConverter().loadDatabase(bibstream);
                BibTeXItemDataProvider bibtexprovider = new BibTeXItemDataProvider();
                bibtexprovider.addDatabase(db);
                provider = bibtexprovider;
            } else if (format == FileFormat.JSON_ARRAY ||
                    format == FileFormat.JSON_OBJECT) {
                JsonParser parser = new JsonParser(new JsonLexer(
                        new InputStreamReader(bibstream, StandardCharsets.UTF_8)));
                List objs;
                if (format == FileFormat.JSON_ARRAY) {
                    objs = parser.parseArray();
                } else {
                    objs = new ArrayList<>();
                    objs.add(parser.parseObject());
                }
                CSLItemData[] items = new CSLItemData[objs.size()];
                for (int i = 0; i < items.length; ++i) {
                    @SuppressWarnings("unchecked")
                    Map obj = (Map)objs.get(i);
                    items[i] = CSLItemData.fromJson(obj);
                }
                provider = new ListItemDataProvider(items);
            } else if (format == FileFormat.YAML) {
                Yaml yaml = new Yaml();
                Iterable documentsIterable = yaml.loadAll(bibstream);
                List> documents = new ArrayList<>();
                documentsIterable.forEach(o -> {
                    if (o instanceof Map) {
                        documents.add(Collections.singletonList(o));
                    } else {
                        documents.add(new ArrayList<>((Collection)o));
                    }
                });
                List providers = new ArrayList<>();
                for (List objs : documents) {
                    CSLItemData[] items = new CSLItemData[objs.size()];
                    for (int i = 0; i < items.length; ++i) {
                        @SuppressWarnings("unchecked")
                        Map obj = (Map)objs.get(i);
                        items[i] = CSLItemData.fromJson(obj);
                    }
                    ItemDataProvider p = new ListItemDataProvider(items);
                    providers.add(p);
                }
                if (providers.size() == 1) {
                    provider = providers.get(0);
                } else {
                    provider = new CompoundItemDataProvider(providers);
                }
            } else if (format == FileFormat.ENDNOTE) {
                EndNoteLibrary lib = new EndNoteConverter().loadLibrary(bibstream);
                EndNoteItemDataProvider endnoteprovider = new EndNoteItemDataProvider();
                endnoteprovider.addLibrary(lib);
                provider = endnoteprovider;
            } else if (format == FileFormat.RIS) {
                RISLibrary lib = new RISConverter().loadLibrary(bibstream);
                RISItemDataProvider risprovider = new RISItemDataProvider();
                risprovider.addLibrary(lib);
                provider = risprovider;
            } else {
                throw new IOException("Unknown bibliography file format");
            }
        } catch (ParseException e) {
            throw new IOException("Could not parse bibliography file", e);
        }

        return provider;
    }

    /**
     * Reads the first 100 KB of the given bibliography file and tries
     * to determine the file format
     * @param bibfile the input file
     * @return the file format (or {@link FileFormat#UNKNOWN} if the format
     * could not be determined)
     * @throws FileNotFoundException if the input file was not found
     * @throws IOException if the input file could not be read
     */
    public FileFormat determineFileFormat(File bibfile)
            throws FileNotFoundException, IOException {
        if (!bibfile.exists()) {
            throw new FileNotFoundException("Bibliography file `" +
                    bibfile.getName() + "' does not exist");
        }
        try (BufferedInputStream bis = new BufferedInputStream(
                new FileInputStream(bibfile))) {
            return determineFileFormat(bis, bibfile.getName());
        }
    }

    /**
     * Reads the first bytes of the given input stream and tries to
     * determine the file format. Resets the input stream to the position
     * it had when the method was called. Reads up to 100 KB and before
     * giving up. Note that you can supply an additional file name to help
     * the method to determine the exact file format. If you don't know the
     * file name you can pass null, but in this case the method's result
     * might be wrong (depending on the input stream's contents).
     * @param bis a buffered input stream that supports the mark and reset
     * methods
     * @param filename the name of the input file (can be null if you don't
     * know the name)
     * @return the file format (or {@link FileFormat#UNKNOWN} if the format
     * could not be determined)
     * @throws IOException if the input stream could not be read
     */
    public FileFormat determineFileFormat(BufferedInputStream bis,
            String filename) throws IOException {
        int len = 1024 * 100;

        String ext = "";
        if (filename != null) {
            int dot = filename.lastIndexOf('.');
            if (dot > 0) {
                ext = filename.substring(dot + 1);
            }
        }

        // check the first couple of bytes
        bis.mark(len);
        try {
            byte[] firstCharacters = new byte[6];
            bis.read(firstCharacters);

            // check if the file starts with a %YAML directive
            if (firstCharacters[0] == '%' &&
                    firstCharacters[1] == 'Y' &&
                    firstCharacters[2] == 'A' &&
                    firstCharacters[3] == 'M' &&
                    firstCharacters[4] == 'L' &&
                    Character.isWhitespace(firstCharacters[5])) {
                return FileFormat.YAML;
            }

            // check if the file starts with an EndNote tag, but
            // also make sure the extension is not 'bib' or 'yml'/'yaml'
            // because BibTeX comments and YAML directives look like
            // EndNote tags
            if (firstCharacters[0] == '%' &&
                    Character.isWhitespace(firstCharacters[2]) &&
                    !ext.equalsIgnoreCase("bib") &&
                    !ext.equalsIgnoreCase("yaml") &&
                    !ext.equalsIgnoreCase("yml")) {
                return FileFormat.ENDNOTE;
            }

            // check if the file starts with a RIS type tag
            if (firstCharacters[0] == 'T' && firstCharacters[1] == 'Y' &&
                    Character.isWhitespace(firstCharacters[2]) &&
                    Character.isWhitespace(firstCharacters[3]) &&
                    firstCharacters[4] == '-') {
                return FileFormat.RIS;
            }
        } finally {
            bis.reset();
        }

        // now check if it's json, bibtex or yaml
        bis.mark(len);
        try {
            while (true) {
                int c = bis.read();
                --len;
                if (c < 0 || len < 2) {
                    return FileFormat.UNKNOWN;
                }

                if (!Character.isWhitespace(c)) {
                    if (c == '[') {
                        return FileFormat.JSON_ARRAY;
                    } else if (c == '{') {
                        return FileFormat.JSON_OBJECT;
                    }
                    if (ext.equalsIgnoreCase("yaml") ||
                            ext.equalsIgnoreCase("yml")) {
                        return FileFormat.YAML;
                    }
                    return FileFormat.BIBTEX;
                }
            }
        } finally {
            bis.reset();
        }
    }
}