All downloads are free. Search and download functionality uses the official Maven repository.

tech.tablesaw.io.Source Maven / Gradle / Ivy

There is a newer version: 0.43.1
Show newest version
package tech.tablesaw.io;

import com.google.common.annotations.VisibleForTesting;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Scanner;

/**
 * Describes where tabular input comes from: a {@link File}, a {@link Reader}, or an {@link
 * InputStream}, together with the {@link Charset} used to decode byte-based input.
 *
 * <p>Each constructor sets exactly one of {@code file}, {@code reader}, or {@code inputStream};
 * the other two are {@code null}. For reader-backed sources the charset is {@code null}, because
 * a reader already yields decoded characters.
 */
public class Source {

  /** Maximum number of leading bytes of a file that are sampled for charset detection. */
  private static final int CHARSET_SAMPLE_SIZE = 9999;

  /** Minimum detector confidence (0 to 100) required before a detected charset is trusted. */
  private static final int MIN_CHARSET_CONFIDENCE = 60;

  // we always have one of these (file, reader, or inputStream)
  protected final File file;
  protected final Reader reader;
  protected final InputStream inputStream;
  protected final Charset charset;

  /**
   * Creates a file-backed source whose charset is detected from the beginning of the file.
   *
   * @param file the file to read
   * @throws IllegalStateException if the file cannot be read for charset detection
   */
  public Source(File file) {
    this(file, getCharSet(file));
  }

  /**
   * Creates a file-backed source that is decoded with the given charset.
   *
   * @param file the file to read
   * @param charset the charset used to decode the file
   */
  public Source(File file, Charset charset) {
    this.file = file;
    this.reader = null;
    this.inputStream = null;
    this.charset = charset;
  }

  /**
   * Creates a reader-backed source. No charset is recorded for this kind of source.
   *
   * @param reader the reader supplying the characters
   */
  public Source(Reader reader) {
    this.file = null;
    this.reader = reader;
    this.inputStream = null;
    this.charset = null;
  }

  /**
   * Creates a stream-backed source decoded with the platform default charset.
   *
   * @param inputStream the stream to read
   */
  public Source(InputStream inputStream) {
    this(inputStream, Charset.defaultCharset());
  }

  /**
   * Creates a stream-backed source decoded with the given charset.
   *
   * @param inputStream the stream to read
   * @param charset the charset used to decode the stream
   */
  public Source(InputStream inputStream, Charset charset) {
    this.file = null;
    this.reader = null;
    this.inputStream = inputStream;
    this.charset = charset;
  }

  /**
   * Creates a source that reads the given string.
   *
   * @param s the text to read
   * @return a reader-backed source over {@code s}
   */
  public static Source fromString(String s) {
    return new Source(new StringReader(s));
  }

  /**
   * Creates a source from the content found at the given URL. The content is downloaded fully
   * before this method returns.
   *
   * @param url the URL to download
   * @return a reader-backed source over the downloaded text
   * @throws IOException if the URL is malformed or cannot be opened
   */
  public static Source fromUrl(String url) throws IOException {
    return new Source(new StringReader(loadUrl(url)));
  }

  /** Returns the backing file, or {@code null} if this source is not file-backed. */
  public File file() {
    return file;
  }

  /** Returns the backing reader, or {@code null} if this source is not reader-backed. */
  public Reader reader() {
    return reader;
  }

  /** Returns the backing input stream, or {@code null} if this source is not stream-backed. */
  public InputStream inputStream() {
    return inputStream;
  }

  /**
   * If cachedBytes are not null, returns a Reader created from the cachedBytes. Otherwise, returns
   * a Reader from the underlying source.
   *
   * <p>Cached bytes are decoded with this source's charset when one is set, so that cached content
   * is interpreted the same way as content read directly from the file or stream. When no charset
   * is set (reader-backed sources), the platform default charset is used.
   *
   * <p>For a reader-backed source without cached bytes, the stored reader itself is returned, not
   * a copy.
   *
   * @param cachedBytes previously read bytes of this source, or {@code null}
   * @return a reader over the cached bytes or over the underlying source
   * @throws IOException if the backing file cannot be opened
   */
  public Reader createReader(byte[] cachedBytes) throws IOException {
    if (cachedBytes != null) {
      InputStream cachedStream = new ByteArrayInputStream(cachedBytes);
      // Honor the configured charset; falling back to the platform default here would decode
      // cached content differently from the direct file/stream paths below.
      return charset != null
          ? new InputStreamReader(cachedStream, charset)
          : new InputStreamReader(cachedStream);
    }
    if (inputStream != null) {
      return new InputStreamReader(inputStream, charset);
    }
    if (reader != null) {
      return reader;
    }
    return new InputStreamReader(new FileInputStream(file), charset);
  }

  /**
   * Downloads the complete content at the given URL as a single string.
   *
   * @param url the URL to download
   * @return the content, or the empty string if there is none
   * @throws IOException if the URL is malformed or cannot be opened
   */
  private static String loadUrl(String url) throws IOException {
    try (Scanner scanner = new Scanner(new URL(url).openStream())) {
      // "\\A" only matches at the start of input, so the whole content becomes one token
      scanner.useDelimiter("\\A");
      return scanner.hasNext() ? scanner.next() : "";
    }
  }

  /**
   * Returns the likely charset for the given file, if it can be determined. A confidence score is
   * calculated. If the score is less than 60 (on a 0 to 100 interval) the system default charset is
   * returned instead.
   *
   * <p>Only the first 9999 bytes of the file (or the whole file, if shorter) are examined.
   *
   * @param file The file to be evaluated
   * @return The likely charset, or the system default charset
   * @throws IllegalStateException if the file cannot be opened or the sample cannot be read
   */
  @VisibleForTesting
  static Charset getCharSet(File file) {
    int sampleSize = (int) Math.min(file.length(), CHARSET_SAMPLE_SIZE);
    byte[] buffer = new byte[sampleSize];
    try (InputStream initialStream = new FileInputStream(file)) {
      // A single read() may legally return fewer bytes than requested, so keep reading until
      // the sample is complete or the stream ends.
      int filled = 0;
      while (filled < sampleSize) {
        int count = initialStream.read(buffer, filled, sampleSize - filled);
        if (count < 0) {
          break;
        }
        filled += count;
      }
      if (filled < sampleSize) {
        throw new IOException("Was not able to read expected number of bytes");
      }
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
    return getCharSet(buffer);
  }

  /**
   * Returns the likely charset for the given byte[], if it can be determined. A confidence score is
   * calculated. If the score is less than 60 (on a 0 to 100 interval) the system default charset is
   * returned instead. The system default is also returned when the detected charset is not
   * available in this JVM.
   *
   * @param buffer The byte array to evaluate
   * @return The likely charset, or the system default charset
   */
  private static Charset getCharSet(byte[] buffer) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer);
    CharsetMatch match = detector.detect();
    if (match == null || match.getConfidence() < MIN_CHARSET_CONFIDENCE) {
      return Charset.defaultCharset();
    }
    try {
      return Charset.forName(match.getName());
    } catch (IllegalArgumentException e) {
      // Covers UnsupportedCharsetException and IllegalCharsetNameException: the detector may
      // report a charset this JVM cannot decode, in which case the default is the best option.
      return Charset.defaultCharset();
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy