// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.osgl.util.BigLines Maven / Gradle / Ivy
//
// The newest version!
package org.osgl.util;

/*-
 * #%L
 * Java Tool
 * %%
 * Copyright (C) 2014 - 2018 OSGL (Open Source General Library)
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import org.osgl.OsglConfig;

import java.io.*;
import java.util.*;
import java.util.concurrent.ThreadLocalRandom;

/**
 * A helper class providing utilities to read through a text file with a big
 * number of lines.
 *
 * It supports:
 *
 * 1. preview the first line
 * 2. get line numbers
 * 3. skip lines
 * 4. fetch certain number of lines
 *
 * Line numbers are `0` based throughout. The line count is computed once,
 * on first use, and cached; later changes to the file are not picked up.
 * Every read operation opens the file again and scans it from the beginning.
 */
public class BigLines implements Iterable<String> {

    /** Upper bound on the number of line indexes drawn by {@link #sampling(int)}. */
    private static final int MAX_SAMPLES = 1100;

    /** Files with at least this many lines are read by several threads in {@link #accept(LineReader)}. */
    private static final int PARALLEL_THRESHOLD = 100 * 100 * 10;

    /** Upper bound on the number of reader threads started by {@link #accept(LineReader)}. */
    private static final int MAX_READ_THREADS = 20;

    /**
     * Callback used by {@link #accept(LineReader)} to consume lines.
     *
     * When the file is large enough to be split into batches, the same
     * instance is handed to every reader thread, so both methods may be
     * invoked concurrently.
     */
    public abstract static class LineReader {

        /**
         * Called once for every line delivered.
         *
         * @param line
         *         the line content without line terminator
         * @param lineNo
         *         the `0` based number of the line among the lines delivered
         */
        public abstract void read(String line, int lineNo);

        /**
         * Called once by each reader after it has delivered its last line.
         */
        public abstract void batchFinished();
    }

    private final File file;

    // lazily computed line count, see lines()
    private volatile Integer lines;
    private String firstLine;
    // tracked separately from `lines`: counting lines must not prevent
    // the first line from being loaded later on
    private volatile boolean firstLineLoaded;
    private boolean iterateFirstLine;

    /**
     * Construct a `BigLines` on top of a file.
     *
     * @param file
     *         the file to read, must exist, be a regular file and be readable
     */
    public BigLines(File file) {
        E.illegalArgumentIfNot(file.exists() && file.isFile() && file.canRead(), "file must exists and be a readable file: " + file);
        this.file = file;
    }

    /**
     * Returns the name of the underlying file.
     */
    public String getName() {
        return file.getName();
    }

    /**
     * Returns `true` if the underlying file has no line.
     */
    public boolean isEmpty() {
        return 0 == lines();
    }

    /**
     * Returns the first line of the file, or `null` if the file is empty.
     *
     * The line is read on the first call and cached.
     */
    public String firstLine() {
        if (!firstLineLoaded) {
            synchronized (this) {
                if (!firstLineLoaded) {
                    if (lines() > 0) {
                        firstLine = fetch(0);
                    }
                    // written last so that a reader seeing the flag also sees firstLine
                    firstLineLoaded = true;
                }
            }
        }
        return firstLine;
    }

    /**
     * Returns the number of lines in the file.
     *
     * The file is scanned on the first call; the result is cached.
     */
    public int lines() {
        if (null == lines) {
            synchronized (this) {
                if (null == lines) {
                    lines = countLines();
                }
            }
        }
        return lines;
    }

    /**
     * Specify whether {@link #accept(LineReader)} should deliver the first
     * line of the file. The first line is skipped unless this is set to `true`.
     *
     * @param flag
     *         `true` to deliver the first line
     */
    public void setIterateFirstLine(boolean flag) {
        this.iterateFirstLine = flag;
    }

    /**
     * Returns first 5 lines including header line.
     */
    public List<String> preview() {
        return preview(5, false);
    }

    /**
     * Returns first `limit` lines including header line.
     *
     * @param limit
     *         the number of lines to be returned
     * @return the first `limit` lines
     */
    public List<String> preview(int limit) {
        return preview(limit, false);
    }

    /**
     * Returns first `limit` lines.
     *
     * @param limit
     *         the number of lines to be returned
     * @param noHeaderLine
     *         if `true` then the header (first) line is excluded from the
     *         returned list and the preview starts at the second line
     * @return the first `limit` lines; an empty list if there is no line to show
     */
    public List<String> preview(int limit, boolean noHeaderLine) {
        E.illegalArgumentIf(limit < 1, "limit must be positive integer");
        if (noHeaderLine && lines() < 2) {
            // nothing follows the header: behave like the empty file case
            return new ArrayList<String>();
        }
        return fetch(noHeaderLine ? 1 : 0, limit);
    }

    /**
     * Returns the line specified by `lineNumber`.
     *
     * Note the `lineNumber` starts with `0`.
     *
     * @param lineNumber
     *         specify the line to be returned.
     * @return the line as described above.
     */
    public String fetch(int lineNumber) {
        E.illegalArgumentIf(lineNumber < 0, "line number must not be negative number: " + lineNumber);
        E.illegalArgumentIf(lineNumber >= lines(), "line number is out of range: " + lineNumber);
        List<String> list = fetch(lineNumber, 1);
        return list.isEmpty() ? null : list.get(0);
    }

    /**
     * Returns a number of lines specified by start position `offset` and `limit`.
     *
     * An empty file always yields an empty list. Fewer than `limit` lines are
     * returned when the end of the file is reached first.
     *
     * @param offset
     *         the start line number (`0` based)
     * @param limit
     *         the number of lines to be returned.
     * @return a number of lines as specified.
     */
    public List<String> fetch(int offset, int limit) {
        E.illegalArgumentIf(limit < 0, "limit must not be negative number: " + limit);
        // never pre-allocate more slots than the file has lines
        return fetch(offset, limit, new ArrayList<String>(Math.min(limit, lines())));
    }

    /**
     * Fill `buf` with up to `limit` lines starting from line `offset`.
     *
     * @param offset
     *         the start line number (`0` based)
     * @param limit
     *         the maximum number of lines to load
     * @param buf
     *         the list to fill; it is cleared first
     * @return `buf`
     */
    private List<String> fetch(int offset, int limit, List<String> buf) {
        buf.clear();
        if (isEmpty()) {
            return buf;
        }
        E.illegalArgumentIf(offset < 0, "offset must not be negative number");
        E.illegalArgumentIf(offset >= lines(), "offset is out of range: " + offset);
        E.illegalArgumentIf(limit < 1, "limit must be at least 1");
        try (BufferedReader reader = IO.buffered(IO.reader(file))) {
            if (skipLines(reader, offset)) {
                for (int i = 0; i < limit; ++i) {
                    String line = reader.readLine();
                    if (null == line) {
                        break;
                    }
                    buf.add(line);
                }
            }
        } catch (IOException e) {
            throw E.ioException(e);
        }
        return buf;
    }

    /**
     * Read and discard `count` lines.
     *
     * @return `false` if the end of the stream was reached before `count`
     *         lines were skipped
     */
    private static boolean skipLines(BufferedReader reader, int count) throws IOException {
        for (int i = 0; i < count; ++i) {
            if (null == reader.readLine()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the line `lineNumber` together with its surrounding lines.
     *
     * The returned window starts `before` lines ahead of `lineNumber`
     * (clamped at the first line of the file) and ends `after` lines behind
     * it (or at the end of the file, whichever comes first).
     *
     * @param lineNumber
     *         the `0` based number of the line in the center of the window
     * @param before
     *         the number of lines to include ahead of `lineNumber`
     * @param after
     *         the number of lines to include behind `lineNumber`
     * @return the lines in the window, in file order
     */
    public List<String> fetchAround(int lineNumber, int before, int after) {
        E.illegalArgumentIf(lineNumber < 0, "line number must not be negative number: " + lineNumber);
        E.illegalArgumentIf(before < 0, "before must not be negative number: " + before);
        E.illegalArgumentIf(after < 0, "after must not be negative number: " + after);
        int offset = Math.max(0, lineNumber - before);
        // computed as long: lineNumber + after may exceed the int range
        long limit = (long) lineNumber + after - offset + 1;
        return fetch(offset, (int) Math.min(limit, Integer.MAX_VALUE));
    }

    /**
     * Returns the lines whose (`0` based) numbers are listed in `index`.
     *
     * The lines are returned in file order. A line number listed more than
     * once yields the line once; numbers that are negative or beyond the end
     * of the file are ignored. The `index` array is not modified.
     *
     * @param index
     *         the line numbers to pick
     * @return the picked lines
     */
    public List<String> cherrypick(int[] index) {
        if (index.length < 1) {
            return C.list();
        }
        // sort a copy so that the caller's array is left untouched
        int[] sorted = index.clone();
        Arrays.sort(sorted);
        int len = sorted.length;
        int cursor = 0;
        while (cursor < len && sorted[cursor] < 0) {
            cursor++;
        }
        List<String> picked = new ArrayList<String>();
        try (BufferedReader reader = IO.buffered(IO.reader(file))) {
            // stop reading as soon as the largest requested line has been handled
            for (int lineNo = 0; cursor < len; ++lineNo) {
                String line = reader.readLine();
                if (null == line) {
                    break;
                }
                if (sorted[cursor] == lineNo) {
                    picked.add(line);
                    // step over duplicates of the number just served
                    do {
                        cursor++;
                    } while (cursor < len && sorted[cursor] == lineNo);
                }
            }
        } catch (IOException e) {
            throw E.ioException(e);
        }
        return picked;
    }

    /**
     * Returns randomly picked lines, never including the first (header) line.
     *
     * At most 1100 line numbers are drawn. Numbers are drawn independently,
     * so the same line may be drawn more than once; such a line is returned
     * once and the result may hold fewer than `number` lines.
     *
     * @param number
     *         the number of line numbers to draw, must be positive
     * @return the sampled lines in file order; an empty list if the file has
     *         fewer than two lines
     */
    public List<String> sampling(int number) {
        E.illegalArgumentIf(number < 1, "sample number must be positive integer");
        int total = lines();
        if (total < 2) {
            // nothing but (at most) the header line: there is nothing to sample
            return new ArrayList<String>();
        }
        int count = Math.min(number, MAX_SAMPLES);
        int[] index = new int[count];
        Random r = ThreadLocalRandom.current();
        for (int i = 0; i < count; ++i) {
            // range [1, total - 1]: line 0 is the header
            index[i] = 1 + r.nextInt(total - 1);
        }
        return cherrypick(index);
    }

    /**
     * Feed the lines of the file to `lineReader`.
     *
     * The first line is skipped unless {@link #setIterateFirstLine(boolean)}
     * was called with `true`. Files with fewer than 100,000 lines are read
     * in the calling thread. Larger files are split into batches that are
     * read by up to 20 threads in parallel; in that case `lineReader` is
     * called concurrently, lines arrive in no particular order across
     * batches and `batchFinished()` is called once per batch. This method
     * returns after all batches have finished.
     *
     * @param lineReader
     *         the callback receiving the lines
     */
    public void accept(LineReader lineReader) {
        int total = lines();
        if (total < PARALLEL_THRESHOLD) {
            readSequentially(total, lineReader);
        } else {
            readInParallel(total, lineReader);
        }
    }

    /** Deliver up to `total` lines to `lineReader` from the calling thread. */
    private void readSequentially(int total, LineReader lineReader) {
        try (BufferedReader reader = IO.buffered(IO.reader(file))) {
            int lineNo = 0;
            for (int i = 0; i < total; ++i) {
                String line = reader.readLine();
                if (null == line) {
                    break;
                }
                if (0 == i && !iterateFirstLine) {
                    continue;
                }
                lineReader.read(line, lineNo++);
            }
            lineReader.batchFinished();
        } catch (IOException e) {
            throw E.ioException(e);
        }
    }

    /** Split the file into batches and deliver them to `lineReader` from worker threads. */
    private void readInParallel(int total, LineReader lineReader) {
        // one thread per PARALLEL_THRESHOLD lines, capped
        int threads = Math.min(total / PARALLEL_THRESHOLD + 1, MAX_READ_THREADS);
        int gap = total / threads + 1;
        List<ReadThread> workers = new ArrayList<ReadThread>(threads);
        for (int i = 0; i < threads; ++i) {
            ReadThread worker = new ReadThread(i * gap, gap, lineReader);
            workers.add(worker);
            worker.start();
        }
        for (ReadThread worker : workers) {
            try {
                worker.join();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw E.unexpected(e);
            }
        }
        // report the first failure to the caller instead of losing it on the worker thread
        for (ReadThread worker : workers) {
            if (null != worker.failure) {
                throw worker.failure;
            }
        }
    }

    /**
     * Reads one batch of lines, `limit` lines starting from line `offset`,
     * and delivers them to the line reader.
     */
    private class ReadThread extends Thread {

        private final int offset;
        private final int limit;
        private final LineReader lineReader;
        // set when the batch failed; read by accept() after join()
        private RuntimeException failure;

        public ReadThread(int offset, int limit, LineReader lineReader) {
            this.offset = offset;
            this.limit = limit;
            this.lineReader = lineReader;
        }

        @Override
        public void run() {
            try {
                readBatch();
            } catch (RuntimeException e) {
                failure = e;
            }
        }

        private void readBatch() {
            try (BufferedReader reader = IO.buffered(IO.reader(file))) {
                if (skipLines(reader, offset)) {
                    // number lines file-wide, the same way the sequential path
                    // does: when the header is skipped, line 1 is number 0
                    int lineNo = iterateFirstLine ? offset : Math.max(0, offset - 1);
                    for (int i = 0; i < limit; ++i) {
                        String line = reader.readLine();
                        if (null == line) {
                            break;
                        }
                        if (0 == offset && 0 == i && !iterateFirstLine) {
                            continue;
                        }
                        lineReader.read(line, lineNo++);
                    }
                }
                lineReader.batchFinished();
            } catch (IOException e) {
                throw E.ioException(e);
            }
        }
    }

    /**
     * An iterator that loads lines in chunks of `bufSize` lines through
     * `fetch`, reopening the file for every chunk.
     */
    class BigLinesIterator implements Iterator<String> {
        private final int bufSize;
        private final List<String> buf;
        // number of the first line that has not been loaded into buf yet
        private int offset;
        private int bufCursor;

        BigLinesIterator(int bufSize) {
            this.bufSize = bufSize;
            this.buf = fetch(0, bufSize);
            this.offset = bufSize;
        }

        @Override
        public boolean hasNext() {
            return (offset - bufSize + bufCursor) < lines();
        }

        @Override
        public String next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            if (bufSize <= bufCursor) {
                // current chunk consumed: load the next one into the same buffer
                fetch(this.offset, this.bufSize, this.buf);
                this.offset += this.bufSize;
                bufCursor = 0;
            }
            return buf.get(bufCursor++);
        }

        @Override
        public void remove() {
            throw E.unsupport();
        }
    }

    /**
     * This method is deprecated. Please use BigLines as an `Iterable` directly.
     *
     * @param bufSize
     *         ignored
     * @return this instance
     */
    @Deprecated
    public Iterable<String> asIterable(int bufSize) {
        return this;
    }

    /**
     * Returns an iterator over all lines of the file, first line included.
     *
     * The file is opened when this method is called and closed once the
     * iterator has reached the end of the file (or failed). An iterator that
     * is abandoned before the last line keeps the file open.
     */
    @Override
    public Iterator<String> iterator() {
        final BufferedReader br = IO.buffered(IO.reader(file));
        return new Iterator<String>() {
            // line read ahead by hasNext() and not yet handed out
            private String nextLine;
            private boolean closed;

            @Override
            public boolean hasNext() {
                if (null != nextLine) {
                    return true;
                }
                if (closed) {
                    return false;
                }
                try {
                    nextLine = br.readLine();
                } catch (IOException e) {
                    release();
                    throw E.ioException(e);
                }
                if (null == nextLine) {
                    release();
                }
                return null != nextLine;
            }

            @Override
            public String next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                String line = nextLine;
                nextLine = null;
                return line;
            }

            @Override
            public void remove() {
                throw E.unsupport();
            }

            private void release() {
                closed = true;
                try {
                    br.close();
                } catch (IOException ignored) {
                    // the stream is done with either way; nothing useful can be done here
                }
            }
        };
    }

    /**
     * Count the lines of the file by counting `\n` bytes.
     *
     * A last line that is not terminated by `\n` is counted as a line too.
     *
     * See https://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
     */
    private int countLines() {
        try (InputStream is = IO.buffered(IO.inputStream(file))) {
            byte[] chunk = new byte[1024];
            int count = 0;
            // an empty file has no unterminated last line
            boolean endsWithNewline = true;
            int read;
            while ((read = is.read(chunk)) != -1) {
                for (int i = 0; i < read; ++i) {
                    if (chunk[i] == '\n') {
                        ++count;
                    }
                }
                if (read > 0) {
                    endsWithNewline = chunk[read - 1] == '\n';
                }
            }
            return endsWithNewline ? count : count + 1;
        } catch (IOException e) {
            throw E.ioException(e);
        }
    }

    /**
     * Manual smoke test; expects `/tmp/1.csv` and `/tmp/2.txt` to exist.
     */
    public static void main(String[] args) {
        BigLines bigLines = new BigLines(new File("/tmp/1.csv"));
        System.out.println(bigLines.lines());

        System.out.println(bigLines.firstLine());

        List<String> lines = bigLines.fetch(555554, 2);
        System.out.println(S.join("\n", lines));

        bigLines = new BigLines(new File("/tmp/2.txt"));
        System.out.println(bigLines.lines());
        for (String line : bigLines) {
            System.out.println(line);
        }
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy