All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikibrain.pageview.PageViewReader Maven / Gradle / Ivy

package org.wikibrain.pageview;

import org.apache.commons.io.IOUtils;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.Title;
import org.wikibrain.utils.WpIOUtils;

import java.io.*;
import java.net.URLDecoder;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads the page view records stored in a single page view file.
 *
 * <p>Each line of the file is split on single spaces and must contain at least three
 * columns: a language code, a URL-encoded page title and a view count. Only lines whose
 * language is contained in the {@link LanguageSet} given to the constructor are returned.
 *
 * @author Shilad Sen
 */
public class PageViewReader implements Iterable<RawPageView> {

    private static final Logger LOG = LoggerFactory.getLogger(PageViewReader.class);

    private final LanguageSet langs;

    private final File path;

    /**
     * Creates a reader over the given page view file.
     *
     * @param path  the page view file to read
     * @param langs the languages whose page views should be returned
     * @throws IllegalArgumentException if {@code path} is not an existing regular file
     */
    public PageViewReader(File path, LanguageSet langs) {
        this.path = path;
        this.langs = langs;
        if (!path.isFile()) {
            throw new IllegalArgumentException("Page view file " + path + " does not exist");
        }
    }

    /**
     * Opens the file and returns a fresh iterator over its page views.
     *
     * @return a new iterator positioned at the start of the file
     * @throws IllegalArgumentException if the file cannot be opened
     */
    @Override
    public PageViewIterator iterator() {
        try {
            return new PageViewIterator();
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Iterator that lazily parses one line at a time from the enclosing reader's file.
     *
     * <p>The underlying reader is closed automatically when the end of the file is reached
     * or when reading fails. Callers that stop iterating early should call {@link #close()}
     * to release the file handle.
     */
    public class PageViewIterator implements Iterator<RawPageView>, Closeable {
        private final AtomicInteger lines = new AtomicInteger();
        private final AtomicInteger errors = new AtomicInteger();
        private final AtomicInteger matches = new AtomicInteger();

        // Set to null once the file has been exhausted or closed.
        private BufferedReader reader;
        // The next view to hand out, or null if none has been read ahead yet.
        private RawPageView buffer = null;

        /**
         * Opens the enclosing reader's file.
         *
         * @throws IOException if the file cannot be opened
         */
        public PageViewIterator() throws IOException {
            this.reader = WpIOUtils.openBufferedReader(path);
        }

        /**
         * @return {@code true} if another matching page view is available
         * @throws RuntimeException if reading the file fails
         */
        @Override
        public synchronized boolean hasNext() {
            return fillBuffer();
        }

        /**
         * Returns the next matching page view.
         *
         * <p>NOTE: unlike the general {@link Iterator} contract, this method returns
         * {@code null} rather than throwing when no more page views are available.
         * That behavior is retained so existing callers keep working.
         *
         * @return the next page view, or {@code null} if the file is exhausted
         * @throws RuntimeException if reading the file fails
         */
        @Override
        public synchronized RawPageView next() {
            if (!fillBuffer()) {
                return null;
            }
            RawPageView view = buffer;
            buffer = null;
            return view;
        }

        /**
         * Reads lines until one yields a page view in a requested language or the file ends.
         *
         * @return {@code true} if {@code buffer} holds a page view after the call
         */
        private synchronized boolean fillBuffer() {
            if (buffer != null) {
                return true;
            }
            if (reader == null) {
                return false;
            }
            while (buffer == null) {
                String line = null;
                try {
                    line = reader.readLine();
                    if (line == null) {
                        // End of file: release the handle and stop.
                        close();
                        break;
                    }
                    if (lines.incrementAndGet() % 1000000 == 0) {
                        LOG.info("File {}: lines={}, errors={}, matches={}",
                                path, lines.get(), errors.get(), matches.get());
                    }
                    String[] cols = line.split(" ");
                    if (cols.length < 3) {
                        LOG.info("Invalid pageview line: {}", line);
                        continue;
                    }

                    Language lang;
                    try {
                        lang = Language.getByLangCode(cols[0]);
                    } catch (IllegalArgumentException e) {
                        continue;   // Not a wikipedia (e.g. Wiktionary)
                    }
                    if (langs.containsLanguage(lang)) {
                        String title = URLDecoder.decode(cols[1], "UTF-8");
                        buffer = new RawPageView(
                                null,
                                new Title(title, lang),
                                Integer.valueOf(cols[2]));
                        matches.incrementAndGet();
                    }
                } catch (IllegalArgumentException e) {
                    // Malformed percent-escape in the title or non-numeric view count:
                    // count the line as an error and move on to the next one.
                    errors.incrementAndGet();
                } catch (UnsupportedEncodingException e) {
                    errors.incrementAndGet();
                } catch (IOException e) {
                    errors.incrementAndGet();
                    // The stream is unusable; release the file handle before propagating.
                    close();
                    throw new RuntimeException(e);
                } catch (Exception e) {
                    // Stay best-effort, but make the failure visible instead of swallowing it.
                    errors.incrementAndGet();
                    LOG.warn("Unexpected error while parsing pageview line: " + line, e);
                }
            }
            return buffer != null;
        }

        /**
         * Closes the underlying reader if it is still open. Safe to call more than once.
         */
        @Override
        public synchronized void close() {
            if (reader != null) {
                IOUtils.closeQuietly(reader);
                reader = null;
            }
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() { throw new UnsupportedOperationException(); }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy