org.wikidata.wdtk.dumpfiles.wmf.WmfDumpFileManager Maven / Gradle / Ivy
Show all versions of wdtk-dumpfiles Show documentation
package org.wikidata.wdtk.dumpfiles.wmf;
/*
* #%L
* Wikidata Toolkit Dump File Handling
* %%
* Copyright (C) 2014 Wikidata Toolkit Developers
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.wdtk.dumpfiles.DumpContentType;
import org.wikidata.wdtk.dumpfiles.DumpProcessingController;
import org.wikidata.wdtk.dumpfiles.MwDumpFile;
import org.wikidata.wdtk.util.DirectoryManager;
import org.wikidata.wdtk.util.WebResourceFetcher;
import org.wikidata.wdtk.util.WebResourceFetcherImpl;
/**
* Class for providing access to available dumpfiles provided by the Wikimedia
* Foundation. The preferred access point for this class is
* {@link DumpProcessingController#processAllRecentRevisionDumps()}, since this
* method takes care of freeing resources and might also provide parallelized
* downloading/processing in the future.
*
* Typically, the Web will be accessed to find information about dumps available
* online. This Web access is mediated by a {@link WebResourceFetcherImpl}
* object, provided upon construction. If null is given instead, the class will
* operate in offline mode, using only previously downloaded files.
*
* The location of the Wikimedia download site is currently hardwired, since the
* extraction methods used to get the data are highly specific to the format of
* files on this site. Other sites (if any) would most likely need different
* methods.
*
* @author Markus Kroetzsch
*
*/
public class WmfDumpFileManager {
/**
* Logger shared by all instances of this class.
*/
static final Logger logger = LoggerFactory
.getLogger(WmfDumpFileManager.class);
/**
* The regular expression that a date stamp should match: exactly eight
* digits. It is used to filter the date stamps extracted from local dump
* directory names.
*/
static final String DATE_STAMP_PATTERN = "\\d\\d\\d\\d\\d\\d\\d\\d";
/**
* The name of the directory where downloaded dump files are stored.
*/
public static final String DOWNLOAD_DIRECTORY_NAME = "dumpfiles";
/**
* Name of the project to obtain dumps for, as passed to the constructor
* (e.g., "wikidatawiki").
*/
final String projectName;
/**
* Directory manager for the project-specific subdirectory of
* {@link #DOWNLOAD_DIRECTORY_NAME}.
*/
final DirectoryManager dumpfileDirectoryManager;
/**
* Fetcher used for all Web access; null means that only local dumps are
* considered.
*/
final WebResourceFetcher webResourceFetcher;
/**
* Constructor.
*
* @param projectName
* name of the project to obtain dumps for as used in the folder
* structure of the dump site, e.g., "wikidatawiki"
* @param downloadDirectoryManager
* the directory manager for the directory where the download
* directory for dump files should be; it will be created if
* needed
* @param webResourceFetcher
* the web resource fetcher to access web resources or null if no
* web access should happen
* @throws IOException
* if it was not possible to access the directory for managing
* dumpfiles
*/
public WmfDumpFileManager(String projectName,
DirectoryManager downloadDirectoryManager,
WebResourceFetcher webResourceFetcher) throws IOException {
this.projectName = projectName;
this.dumpfileDirectoryManager = downloadDirectoryManager
.getSubdirectoryManager(
WmfDumpFileManager.DOWNLOAD_DIRECTORY_NAME)
.getSubdirectoryManager(projectName);
this.webResourceFetcher = webResourceFetcher;
WmfDumpFileManager.logger.info("Using download directory "
+ this.dumpfileDirectoryManager.toString());
}
/**
 * Finds all page revision dump files, online or locally, that are relevant
 * to obtain the most current state of the data. Revision dump files are
 * dumps that contain page revisions in MediaWiki's XML format.
 * <p>
 * If the parameter preferCurrent is true, the main dump is the most recent
 * available dump that contains only current revisions; otherwise it is the
 * most recent available full dump. There is no fallback between the two
 * kinds: if no main dump of the requested kind is available, all daily
 * dumps are returned. Dump files may still contain non-current revisions,
 * and when processing multiple dumps there might even be overlaps (one
 * revision occurring in multiple dumps).
 * <p>
 * The result is ordered with the most recent dump first: all daily dumps
 * with a date stamp strictly greater than that of the main dump, followed
 * by the main dump itself. If a dump file A contains revisions of a page P,
 * and Rmax is the maximal revision of P in A, then every dump file that
 * comes after A should contain only revisions of P that are smaller than
 * or equal to Rmax.
 *
 * @param preferCurrent
 *            should dumps with current revisions be preferred?
 * @return an ordered list of all dump files that match the given criteria
 */
public List<MwDumpFile> findAllRelevantRevisionDumps(boolean preferCurrent) {
    MwDumpFile mainDump = findMostRecentDump(
            preferCurrent ? DumpContentType.CURRENT : DumpContentType.FULL);

    // Assigned to a typed local first so that the loop below is type-safe.
    List<MwDumpFile> dailyDumps = findAllDumps(DumpContentType.DAILY);
    if (mainDump == null) {
        // Without a main dump, every daily dump is relevant.
        return dailyDumps;
    }

    List<MwDumpFile> result = new ArrayList<>();
    String mainDateStamp = mainDump.getDateStamp();
    for (MwDumpFile dumpFile : dailyDumps) {
        // Only dailies newer than the main dump add information to it.
        if (dumpFile.getDateStamp().compareTo(mainDateStamp) > 0) {
            result.add(dumpFile);
        }
    }
    result.add(mainDump);

    if (logger.isInfoEnabled()) {
        StringBuilder logMessage = new StringBuilder();
        logMessage.append("Found ")
                .append(result.size())
                .append(" relevant dumps to process:");
        for (MwDumpFile dumpFile : result) {
            logMessage.append("\n * ").append(dumpFile.toString());
        }
        logger.info(logMessage.toString());
    }
    return result;
}
/**
 * Finds the most recent dump of the given type that is actually available.
 * The dumps returned by {@link #findAllDumps(DumpContentType)} are checked
 * in order and the first one that reports itself as available is returned.
 *
 * @param dumpContentType
 *            the type of the dump to look for
 * @return most recent available dump of the given type, or null if no such
 *         dump exists
 */
public MwDumpFile findMostRecentDump(DumpContentType dumpContentType) {
    // Typed local: iterating a raw List with a MwDumpFile variable would
    // not compile.
    List<MwDumpFile> dumps = findAllDumps(dumpContentType);
    for (MwDumpFile dump : dumps) {
        if (dump.isAvailable()) {
            return dump;
        }
    }
    return null;
}
/**
 * Returns a list of all dump files of the given type available either
 * online or locally. For dumps available both online and locally, the local
 * version is included. The list is ordered with most recent dump date
 * first. Online dumps found by this method might not be available yet (if
 * their directory has been created online but the file was not uploaded or
 * completely written yet).
 * <p>
 * If no web resource fetcher was configured, only local dumps are
 * returned.
 *
 * @param dumpContentType
 *            the type of dump to consider
 * @return a list of dump files of the given type
 */
public List<MwDumpFile> findAllDumps(DumpContentType dumpContentType) {
    List<MwDumpFile> localDumps = findDumpsLocally(dumpContentType);
    if (this.webResourceFetcher == null) {
        // Offline mode: nothing to merge.
        return localDumps;
    }
    List<MwDumpFile> onlineDumps = findDumpsOnline(dumpContentType);
    return mergeDumpLists(localDumps, onlineDumps);
}
/**
 * Merges a list of local and online dumps. For dumps available both online
 * and locally (that is, with the same date stamp), only the local version
 * is included. The list is ordered with most recent dump date first. The
 * input lists are not modified.
 *
 * @param localDumps
 *            dumps found locally
 * @param onlineDumps
 *            dumps found online
 * @return a merged list of dump files
 */
List<MwDumpFile> mergeDumpLists(List<MwDumpFile> localDumps,
        List<MwDumpFile> onlineDumps) {
    List<MwDumpFile> result = new ArrayList<>(localDumps);

    HashSet<String> localDateStamps = new HashSet<>();
    for (MwDumpFile dumpFile : localDumps) {
        localDateStamps.add(dumpFile.getDateStamp());
    }

    // Add only those online dumps that have no local counterpart.
    for (MwDumpFile dumpFile : onlineDumps) {
        if (!localDateStamps.contains(dumpFile.getDateStamp())) {
            result.add(dumpFile);
        }
    }

    result.sort(Collections.reverseOrder(new MwDumpFile.DateComparator()));
    return result;
}
/**
 * Finds out which dump files of the given type have been downloaded
 * already. The result is a list of objects that describe the available dump
 * files, in descending order by their date.
 * <p>
 * Directories whose extracted date stamp does not match
 * {@link #DATE_STAMP_PATTERN} are silently ignored. Directories with a valid
 * date stamp whose dump file reports itself as not available are logged as
 * errors and left out of the result. If the dump directory cannot be
 * accessed, the error is logged and an empty list is returned.
 *
 * @param dumpContentType
 *            the type of dump to consider
 * @return list of objects that provide information on available dumps
 */
List<MwDumpFile> findDumpsLocally(DumpContentType dumpContentType) {
    String directoryPattern = WmfDumpFile.getDumpFileDirectoryName(
            dumpContentType, "*");

    List<String> dumpFileDirectories;
    try {
        dumpFileDirectories = this.dumpfileDirectoryManager
                .getSubdirectories(directoryPattern);
    } catch (IOException e) {
        // Best effort: report the problem (including its cause) and
        // continue as if no local dumps existed.
        logger.error("Unable to access dump directory: " + e.toString(), e);
        return Collections.emptyList();
    }

    List<MwDumpFile> result = new ArrayList<>();
    for (String directory : dumpFileDirectories) {
        String dateStamp = WmfDumpFile
                .getDateStampFromDumpFileDirectoryName(dumpContentType,
                        directory);
        if (!dateStamp.matches(WmfDumpFileManager.DATE_STAMP_PATTERN)) {
            // Silently ignore directories that don't match.
            continue;
        }

        WmfLocalDumpFile dumpFile = new WmfLocalDumpFile(dateStamp,
                this.projectName, dumpfileDirectoryManager,
                dumpContentType);
        if (dumpFile.isAvailable()) {
            result.add(dumpFile);
        } else {
            logger.error(
                    "Incomplete local dump file data. Maybe delete {} to attempt fresh download.",
                    dumpFile.getDumpfileDirectory());
        }
    }

    result.sort(Collections.reverseOrder(new MwDumpFile.DateComparator()));
    logger.info("Found {} local dumps of type {}: {}", result.size(),
            dumpContentType, result);
    return result;
}
/**
 * Finds out which dump files of the given type are available for download.
 * The result is a list of objects that describe the dump files, in the
 * order of the date stamps returned by
 * {@link #findDumpDatesOnline(DumpContentType)}. No availability check is
 * made here, so not all of the dumps included might be actually available.
 * <p>
 * Daily dumps are represented by {@link WmfOnlineDailyDumpFile}, JSON dumps
 * by {@link JsonOnlineDumpFile}, and all other types by
 * {@link WmfOnlineStandardDumpFile}.
 *
 * @param dumpContentType
 *            the type of dump to consider
 * @return list of objects that provide information on online dumps of the
 *         given type
 */
List<MwDumpFile> findDumpsOnline(DumpContentType dumpContentType) {
    // Typed local: iterating a raw List with a String variable would not
    // compile.
    List<String> dumpFileDates = findDumpDatesOnline(dumpContentType);

    List<MwDumpFile> result = new ArrayList<>();
    for (String dateStamp : dumpFileDates) {
        if (dumpContentType == DumpContentType.DAILY) {
            result.add(new WmfOnlineDailyDumpFile(dateStamp,
                    this.projectName, this.webResourceFetcher,
                    this.dumpfileDirectoryManager));
        } else if (dumpContentType == DumpContentType.JSON) {
            result.add(new JsonOnlineDumpFile(dateStamp, this.projectName,
                    this.webResourceFetcher, this.dumpfileDirectoryManager));
        } else {
            result.add(new WmfOnlineStandardDumpFile(dateStamp,
                    this.projectName, this.webResourceFetcher,
                    this.dumpfileDirectoryManager, dumpContentType));
        }
    }

    logger.info("Found {} online dumps of type {}: {}", result.size(),
            dumpContentType, result);
    return result;
}
/**
* Finds out which dump files are available for download in a given
* directory. The result is a list of YYYYMMDD date stamps, ordered newest
* to oldest. The list is based on the directories or files found at the
* target location, without considering whether or not each dump is actually
* available.
*
* The implementation is rather uniform since all cases supported thus far
* use directory/file names that start with a date stamp. If the date would
* occur elsewhere or in another form, then more work would be needed.
*
* @param dumpContentType
* the type of dump to consider
* @return list of date stamps
*/
List findDumpDatesOnline(DumpContentType dumpContentType) {
List result = new ArrayList<>();
try (InputStream in = this.webResourceFetcher
.getInputStreamForUrl(WmfDumpFile.getDumpFileWebDirectory(
dumpContentType, this.projectName))) {
BufferedReader bufferedReader = new BufferedReader(
new InputStreamReader(in, StandardCharsets.UTF_8));
String inputLine;
while ((inputLine = bufferedReader.readLine()) != null) {
String dateStamp = "";
if (inputLine.startsWith("")) {
// old format of HTML file lists
dateStamp = inputLine.substring(27, 35);
} else if (inputLine.startsWith("
© 2015 - 2024 Weber Informatics LLC | Privacy Policy