
com.simiacryptus.util.test.WikiArticle Maven / Gradle / Ivy
/*
* Copyright (c) 2019 by Andrew Charneski.
*
* The author licenses this file to you under the
* Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance
* with the License. You may obtain a copy
* of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.simiacryptus.util.test;
import com.simiacryptus.ref.wrappers.RefList;
import com.simiacryptus.util.Util;
import com.simiacryptus.util.io.DataLoader;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import java.util.concurrent.atomic.AtomicInteger;
public class WikiArticle extends TestDocument {
/**
 * Loader for the latest English Wikipedia pages-articles dump (bzip2-compressed XML),
 * constructed with an article limit of 10000.
 */
@Nonnull
public static final WikiDataLoader ENGLISH = new WikiDataLoader(
    URI.create("https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"), 10000);
/**
 * Loader for the latest German Wikipedia pages-articles dump (bzip2-compressed XML),
 * constructed with an article limit of 10000.
 */
@Nonnull
public static final WikiDataLoader GERMAN = new WikiDataLoader(
    URI.create("https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2"), 10000);
/**
 * Loader for the latest French Wikipedia pages-articles dump (bzip2-compressed XML),
 * constructed with an article limit of 10000.
 */
@Nonnull
public static final WikiDataLoader FRENCH = new WikiDataLoader(
    URI.create("https://dumps.wikimedia.org/frwiki/latest/frwiki-latest-pages-articles.xml.bz2"), 10000);
/**
 * Creates an article from a title and its body text.
 * Both arguments are handed unchanged to the {@code TestDocument} constructor.
 *
 * @param title the article title
 * @param text  the article text
 */
public WikiArticle(final String title, final String text) {
  super(title, text);
}
public static class WikiDataLoader extends DataLoader {
/** String form of the source URI, set by the constructor from {@code uri.toString()}. */
protected final String url;
/**
 * Last "/"-separated segment of the source URI's path, set by the constructor.
 * Passed together with {@link #url} to {@code Util.cacheLocal} in {@code read}.
 */
protected final String file;
/**
 * Article limit supplied to the constructor.
 * NOTE(review): presumably caps how many articles {@code read} produces - confirm against the full method.
 */
protected final int articleLimit;
/**
 * Creates a loader for the dump located at the given URI.
 * <p>
 * The URI's string form is stored in {@code url}, and the last "/"-separated segment of its
 * path is stored in {@code file}. A URI with an empty path yields an empty file name.
 *
 * @param uri          location of the dump; must have a path component containing at least one segment
 * @param articleLimit value stored in {@code articleLimit}
 * @throws IllegalArgumentException if the URI has no path (e.g. an opaque URI such as
 *                                  {@code mailto:a@b.c}) or its path consists only of slashes
 */
public WikiDataLoader(@Nonnull URI uri, int articleLimit) {
  super();
  this.url = uri.toString();
  this.articleLimit = articleLimit;
  // URI.getPath() returns null for opaque URIs; fail with a descriptive message instead of an NPE.
  final String path = uri.getPath();
  if (null == path) {
    throw new IllegalArgumentException("URI has no path component: " + uri);
  }
  // String.split drops trailing empty strings, so a path like "/" produces an empty array;
  // indexing it would otherwise raise ArrayIndexOutOfBoundsException.
  final String[] split = path.split("/");
  if (split.length == 0) {
    throw new IllegalArgumentException("URI path does not contain a file name: " + uri);
  }
  this.file = split[split.length - 1];
}
@Override
protected void read(@Nonnull RefList queue) {
try {
try (final InputStream in = new BZip2CompressorInputStream(Util.cacheLocal(file, new URI(url)), true)) {
final SAXParserFactory spf = SAXParserFactory.newInstance();
spf.setNamespaceAware(false);
final SAXParser saxParser = spf.newSAXParser();
saxParser.parse(in, new DefaultHandler() {
@Nonnull
Stack prefix = new Stack();
@Nonnull
Stack
© 2015 - 2025 Weber Informatics LLC | Privacy Policy