// All Downloads are FREE. Search and download functionalities use the official Maven repository.

// com.simiacryptus.util.test.WikiArticle Maven / Gradle / Ivy

/*
 * Copyright (c) 2019 by Andrew Charneski.
 *
 * The author licenses this file to you under the
 * Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance
 * with the License.  You may obtain a copy
 * of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.simiacryptus.util.test;

import com.simiacryptus.ref.wrappers.RefList;
import com.simiacryptus.util.Util;
import com.simiacryptus.util.io.DataLoader;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import java.util.concurrent.atomic.AtomicInteger;

public class WikiArticle extends TestDocument {

  @Nonnull
  public static WikiDataLoader ENGLISH = new WikiDataLoader(
      URI.create("https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"), 10000);
  @Nonnull
  public static WikiDataLoader GERMAN = new WikiDataLoader(
      URI.create("https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2"), 10000);
  @Nonnull
  public static WikiDataLoader FRENCH = new WikiDataLoader(
      URI.create("https://dumps.wikimedia.org/frwiki/latest/frwiki-latest-pages-articles.xml.bz2"), 10000);

  public WikiArticle(String title, String text) {
    super(title, text);
  }

  public static class WikiDataLoader extends DataLoader {
    protected final String url;
    protected final String file;
    protected final int articleLimit;

    public WikiDataLoader(@Nonnull URI uri, int articleLimit) {
      super();
      this.url = uri.toString();
      this.articleLimit = articleLimit;
      String path = uri.getPath();
      String[] split = path.split("/");
      file = split[split.length - 1];
    }

    @Override
    protected void read(@Nonnull RefList queue) {
      try {
        try (final InputStream in = new BZip2CompressorInputStream(Util.cacheLocal(file, new URI(url)), true)) {
          final SAXParserFactory spf = SAXParserFactory.newInstance();
          spf.setNamespaceAware(false);
          final SAXParser saxParser = spf.newSAXParser();
          saxParser.parse(in, new DefaultHandler() {
            @Nonnull
            Stack prefix = new Stack();
            @Nonnull
            Stack> indexes = new Stack>();
            @Nonnull
            StringBuilder nodeString = new StringBuilder();
            @Nullable
            private String title;

            @Override
            public void characters(final char[] ch, final int start, final int length) throws SAXException {
              if (Thread.currentThread().isInterrupted()) {
                throw Util.throwException(new InterruptedException());
              }
              this.nodeString.append(ch, start, length);
              super.characters(ch, start, length);
            }

            @Override
            public void endDocument() throws SAXException {
              super.endDocument();
            }

            @Override
            public void endElement(final String uri, final String localName, final String qName) throws SAXException {
              if (Thread.currentThread().isInterrupted()) {
                throw Util.throwException(new InterruptedException());
              }
              final CharSequence pop = this.prefix.pop();
              this.indexes.pop();

              final int length = this.nodeString.length();
              String text = this.nodeString.toString().trim();
              this.nodeString = new StringBuilder();

              if ("page".equals(qName)) {
                this.title = null;
              } else if ("title".equals(qName)) {
                this.title = text;
              } else if ("text".equals(qName)) {
                //com.simiacryptus.ref.wrappers.System.p.println(String.format("Read #%s - %s", queue.size(), this.title));
                queue.add(new WikiArticle(this.title, text));
                if (queue.size() > articleLimit) {
                  throw Util.throwException(new InterruptedException());
                }
              }
              super.endElement(uri, localName, qName);
            }

            @Override
            public void startDocument() throws SAXException {
              super.startDocument();
            }

            @Override
            public void startElement(final String uri, final String localName, final String qName,
                                     final Attributes attributes) throws SAXException {
              if (Thread.currentThread().isInterrupted()) {
                throw Util.throwException(new InterruptedException());
              }
              int idx;
              if (0 < this.indexes.size()) {
                final Map index = this.indexes.peek();
                AtomicInteger cnt = index.get(qName);
                if (null == cnt) {
                  cnt = new AtomicInteger(-1);
                  index.put(qName, cnt);
                }
                idx = cnt.incrementAndGet();
              } else {
                idx = 0;
              }
              String path = 0 == this.prefix.size() ? qName : this.prefix.peek() + "/" + qName;
              if (0 < idx) {
                path += "[" + idx + "]";
              }
              this.prefix.push(path);
              this.indexes.push(new HashMap());
              super.startElement(uri, localName, qName, attributes);
            }
          }, null);
        }
      } catch (@Nonnull final RuntimeException e) {
        if (!(e.getCause() instanceof InterruptedException))
          e.printStackTrace();
      } catch (@Nonnull final Exception e) {
        e.printStackTrace();
      } finally {
        System.err.println("Read thread exit");
      }
    }
  }

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy