All downloads are free. The search and download features use the official Maven repository.

org.neo4j.csv.reader.Readables Maven / Gradle / Ivy

There is a newer version: 3.1.0-M03
Show newest version
/*
 * Copyright (c) 2002-2016 "Neo Technology,"
 * Network Engine for Objects in Lund AB [http://neotechnology.com]
 *
 * This file is part of Neo4j.
 *
 * Neo4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.neo4j.csv.reader;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.neo4j.collection.RawIterator;
import org.neo4j.function.IOFunction;
import org.neo4j.function.IOFunctions;
import org.neo4j.function.ThrowingFunction;

/**
 * Means of instantiating common {@link CharReadable} instances.
 *
 * There are support for compressed files as well for those methods accepting a {@link File} argument.
 * 
    *
  1. ZIP: is both an archive and a compression format. In many cases the order of files * is important and for a ZIP archive with multiple files, the order of the files are whatever the order * set by the tool that created the ZIP archive. Therefore only single-file-zip files are supported. * The single file in the given ZIP archive will be decompressed on the fly, while reading.
  2. *
  3. GZIP: is only a compression format and so will be decompressed on the fly, while reading.
  4. *
*/ public class Readables { private Readables() { throw new AssertionError( "No instances allowed" ); } public static final CharReadable EMPTY = new CharReadable.Adapter() { @Override public SectionedCharBuffer read( SectionedCharBuffer buffer, int from ) throws IOException { return buffer; } @Override public void close() throws IOException { // Nothing to close } @Override public String sourceDescription() { return "EMPTY"; } }; public static CharReadable wrap( final InputStream stream, final String sourceName, Charset charset ) throws IOException { byte[] bytes = new byte[Magic.longest()]; PushbackInputStream pushbackStream = new PushbackInputStream( stream, bytes.length ); Charset usedCharset = charset; int read = stream.read( bytes ); if ( read >= 0 ) { bytes = read < bytes.length ? Arrays.copyOf( bytes, read ) : bytes; Magic magic = Magic.of( bytes ); int excessiveBytes = read; if ( magic.impliesEncoding() ) { // Unread the diff between the BOM and the longest magic we gathered bytes for excessiveBytes -= magic.length(); usedCharset = magic.encoding(); } pushbackStream.unread( bytes, read - excessiveBytes, excessiveBytes ); } return wrap( new InputStreamReader( pushbackStream, usedCharset ) { @Override public String toString() { return sourceName; } } ); } /** * Remember that the {@link Reader#toString()} must provide a description of the data source. 
*/ public static CharReadable wrap( final Reader reader ) { return new CharReadable.Adapter() { private long position; private final String sourceDescription = reader.toString(); @Override public SectionedCharBuffer read( SectionedCharBuffer buffer, int from ) throws IOException { buffer.compact( buffer, from ); buffer.readFrom( reader ); position += buffer.available(); return buffer; } @Override public void close() throws IOException { reader.close(); } @Override public long position() { return position; } @Override public String sourceDescription() { return sourceDescription; } }; } private static class FromFile implements IOFunction { private final Charset charset; FromFile( Charset charset ) { this.charset = charset; } @Override public Reader apply( final File file ) throws IOException { Magic magic = Magic.of( file ); if ( magic == Magic.ZIP ) { // ZIP file ZipFile zipFile = new ZipFile( file ); ZipEntry entry = getSingleSuitableEntry( zipFile ); return new InputStreamReader( zipFile.getInputStream( entry ), charset ) { @Override public String toString() { return file.getPath(); } }; } else if ( magic == Magic.GZIP ) { // GZIP file. GZIP isn't an archive like ZIP, so this is purely data that is compressed. // Although a very common way of compressing with GZIP is to use TAR which can combine many // files into one blob, which is then compressed. If that's the case then // the data will look like garbage and the reader will fail for whatever it will be used for. 
// TODO add tar support GZIPInputStream zipStream = new GZIPInputStream( new FileInputStream( file ) ); return new InputStreamReader( zipStream, charset ) { @Override public String toString() { return file.getPath(); } }; } else { InputStream in = new FileInputStream( file ); Charset usedCharset = this.charset; if ( magic.impliesEncoding() ) { // Read (and skip) the magic (BOM in this case) from the file we're returning out in.skip( magic.length() ); usedCharset = magic.encoding(); } return new InputStreamReader( in, usedCharset ) { @Override public String toString() { return file.getPath(); } }; } } private ZipEntry getSingleSuitableEntry( ZipFile zipFile ) throws IOException { List unsuitableEntries = new ArrayList<>(); Enumeration enumeration = zipFile.entries(); ZipEntry found = null; while ( enumeration.hasMoreElements() ) { ZipEntry entry = enumeration.nextElement(); if ( entry.isDirectory() || invalidZipEntry( entry.getName() ) ) { unsuitableEntries.add( entry.getName() ); continue; } if ( found != null ) { throw new IOException( "Multiple suitable files found in zip file " + zipFile.getName() + ", at least " + found.getName() + " and " + entry.getName() + ". Only a single file per zip file is supported" ); } found = entry; } if ( found == null ) { throw new IOException( "No suitable file found in zip file " + zipFile.getName() + "." + (!unsuitableEntries.isEmpty() ? " Although found these unsuitable entries " + unsuitableEntries : "" ) ); } return found; } } private static boolean invalidZipEntry( String name ) { return name.contains( "__MACOSX" ) || name.startsWith( "." ) || name.contains( "/." ); } public static CharReadable files( Charset charset, File... files ) throws IOException { IOFunction opener = new FromFile( charset ); switch ( files.length ) { case 0: return EMPTY; case 1: return wrap( opener.apply( files[0] ) ); default: return new MultiReadable( iterator( files, opener ) ); } } public static CharReadable sources( Reader... 
sources ) throws IOException { return new MultiReadable( iterator( sources, IOFunctions.identity() ) ); } public static CharReadable sources( RawIterator sources ) throws IOException { return new MultiReadable( sources ); } private static RawIterator iterator( final IN[] items, final ThrowingFunction converter ) { if ( items.length == 0 ) { throw new IllegalStateException( "No source items specified" ); } return new RawIterator() { private int cursor; @Override public boolean hasNext() { return cursor < items.length; } @Override public OUT next() throws IOException { if ( !hasNext() ) { throw new IllegalStateException(); } return converter.apply( items[cursor++] ); } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy