com.simiacryptus.util.test.WikiArticle Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of char-trie Show documentation
Show all versions of char-trie Show documentation
Optimized Character Trie Implemented in Java 8
/*
* Copyright (c) 2018 by Andrew Charneski.
*
* The author licenses this file to you under the
* Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance
* with the License. You may obtain a copy
* of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.simiacryptus.util.test;
import com.simiacryptus.util.Util;
import com.simiacryptus.util.io.DataLoader;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.concurrent.atomic.AtomicInteger;
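/**
* A {@code TestDocument} built from the title and text of a Wikipedia article,
* with predefined loaders for the English, German and French Wikipedia article dumps.
*/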
public class WikiArticle extends TestDocument {
/**
 * Loader for the latest English Wikipedia pages-articles dump, created with an
 * article limit of 10000. Declared {@code final} so the shared instance cannot be
 * swapped out from elsewhere.
 */
public static final WikiDataLoader ENGLISH = new WikiDataLoader(URI.create(
    "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"), 10000);
/**
 * Loader for the latest German Wikipedia pages-articles dump, created with an
 * article limit of 10000. Declared {@code final} so the shared instance cannot be
 * swapped out from elsewhere.
 */
public static final WikiDataLoader GERMAN = new WikiDataLoader(URI.create(
    "https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2"), 10000);
/**
 * Loader for the latest French Wikipedia pages-articles dump, created with an
 * article limit of 10000. Declared {@code final} so the shared instance cannot be
 * swapped out from elsewhere.
 */
public static final WikiDataLoader FRENCH = new WikiDataLoader(URI.create(
    "https://dumps.wikimedia.org/frwiki/latest/frwiki-latest-pages-articles.xml.bz2"), 10000);
/**
 * Creates an article from a title and its body text; both values are handed
 * unchanged to the {@code TestDocument} superclass constructor.
 *
 * @param title the article title
 * @param text  the article text
 */
public WikiArticle(final String title, final String text) {
  super(title, text);
}
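/**
* Data loader for a Wikipedia articles dump: {@code read} obtains the dump through
* {@code Util.cacheLocal}, decompresses it as bzip2 and parses it with a SAX parser.
*/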
public static class WikiDataLoader extends DataLoader {
/**
* String form of the source URI ({@code uri.toString()}) given to the constructor.
* {@code read} turns it back into a {@link URI} when it requests the dump.
*/
protected final String url;
/**
* Last {@code /}-separated segment of the source URI's path, computed in the constructor.
* {@code read} passes it to {@code Util.cacheLocal} alongside the URL
* (presumably as the local cache file name; confirm against {@code Util}).
*/
protected final String file;
/**
* Article limit supplied to the constructor. Its use is not visible in this part of
* the file; presumably it caps how many articles are loaded (TODO confirm).
*/
protected final int articleLimit;
/**
 * Creates a loader for the dump at the given URI.
 *
 * <p>The URI is stored in string form as {@code url}, and the last
 * {@code /}-separated segment of its path is stored as {@code file}.
 *
 * @param uri          location of the dump; must have a path containing at least one
 *                     non-slash segment
 * @param articleLimit the article limit to store on this loader
 * @throws IllegalArgumentException if the URI has no path component (for example an
 *                                  opaque URI) or its path consists only of slashes
 */
public WikiDataLoader(URI uri, int articleLimit) {
  super();
  this.url = uri.toString();
  this.articleLimit = articleLimit;
  // URI.getPath() returns null when the path is undefined (opaque URIs such as
  // "mailto:..."); report that explicitly instead of failing with a bare NPE below.
  final String path = uri.getPath();
  if (null == path) {
    throw new IllegalArgumentException("URI has no path component: " + uri);
  }
  // String.split drops trailing empty strings, so a path made only of slashes
  // (e.g. "/") yields an empty array and indexing length - 1 would be out of bounds.
  final String[] segments = path.split("/");
  if (0 == segments.length) {
    throw new IllegalArgumentException("URI path does not name a file: " + uri);
  }
  this.file = segments[segments.length - 1];
}
@Override
protected void read(List queue) {
try {
try (final InputStream in = new BZip2CompressorInputStream(Util.cacheLocal(file, new URI(url)), true)) {
final SAXParserFactory spf = SAXParserFactory.newInstance();
spf.setNamespaceAware(false);
final SAXParser saxParser = spf.newSAXParser();
saxParser.parse(in, new DefaultHandler() {
Stack prefix = new Stack();
Stack
© 2015 - 2025 Weber Informatics LLC | Privacy Policy