All downloads are free. Search and download functionality uses the official Maven repository.

de.tudarmstadt.ukp.wikipedia.timemachine.domain.TimeMachineGenerator Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.timemachine.domain;

import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;

import de.tudarmstadt.ukp.wikipedia.wikimachine.domain.AbstractSnapshotGenerator;
import de.tudarmstadt.ukp.wikipedia.wikimachine.domain.Files;
import de.tudarmstadt.ukp.wikipedia.wikimachine.domain.MetaData;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.sql.CategorylinksParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.sql.PagelinksParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.version.IDumpVersion;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.DumpTableEnum;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.DumpTableInputStream;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.PageParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.RevisionParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.TextParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.factory.IEnvironmentFactory;
import de.tudarmstadt.ukp.wikipedia.wikimachine.util.TimestampUtil;

/**
 * Generate dumps as .txt files for the JWPL database from given MediaWiki dump
 * files.
* By specifying a 'from' and a 'to' time stamps and the number of days to take * as interval
* this class produces multiple dump versions. * * */ public class TimeMachineGenerator extends AbstractSnapshotGenerator { private IDumpVersion versions[] = null; private TimeMachineFiles initialFiles = null; public TimeMachineGenerator(IEnvironmentFactory environmentFactory) { super(environmentFactory); } @Override public void setFiles(Files files) { initialFiles = (TimeMachineFiles) files; } private Integer calculateSnapshotsCount(Timestamp from, Timestamp to, Integer dayInterval) { Integer result = 0; for (Timestamp i = from; i.before(to); i = TimestampUtil .getNextTimestamp(i, dayInterval)) { result++; } return result; } @Override public void start() throws Exception { Timestamp fromTimestamp = configuration.getFromTimestamp(); Timestamp toTimestamp = configuration.getToTimestamp(); Integer each = configuration.getEach(); Integer snapshotsCount = fromTimestamp.equals(toTimestamp) ? 1 : calculateSnapshotsCount(fromTimestamp, toTimestamp, each); if (snapshotsCount > 0) { versions = new IDumpVersion[snapshotsCount]; logger.log("Dumps to be generated:"); for (int i = 0; i < snapshotsCount; i++) { Timestamp currentTimestamp = TimestampUtil.getNextTimestamp( fromTimestamp, i * each); logger.log(currentTimestamp); MetaData commonMetaData = MetaData .initWithConfig(configuration); commonMetaData.setTimestamp(currentTimestamp); IDumpVersion version = environmentFactory.getDumpVersion(); version.initialize(currentTimestamp); version.setMetaData(commonMetaData); TimeMachineFiles currentFiles = new TimeMachineFiles( initialFiles); currentFiles.setTimestamp(currentTimestamp); version.setFiles(currentFiles); versions[i] = version; } processInputDumps(); } else { logger.log("No timestamps."); } } private void processInputDumps() throws IOException { dumpVersionProcessor.setDumpVersions(versions); logger.log("Processing the revision table"); dumpVersionProcessor.processRevision(createRevisionParser()); logger.log("Processing the page table"); 
dumpVersionProcessor.processPage(createPageParser()); logger.log("Processing the categorylinks table"); dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); logger.log("Processing the pagelinks table"); dumpVersionProcessor.processPagelinks(createPagelinksParser()); logger.log("Processing the text table"); dumpVersionProcessor.processText(createTextParser()); logger.log("Writing meta data"); dumpVersionProcessor.writeMetaData(); } private RevisionParser createRevisionParser() throws IOException { String metahistory = initialFiles.getMetaHistoryFile(); DumpTableInputStream revisionTableInputStream = environmentFactory .getDumpTableInputStream(); revisionTableInputStream.initialize(decompressor .getInputStream(metahistory), DumpTableEnum.REVISION); RevisionParser revisionParser = environmentFactory.getRevisionParser(); revisionParser.setInputStream(revisionTableInputStream); return revisionParser; } private PageParser createPageParser() throws IOException { String metahistory = initialFiles.getMetaHistoryFile(); DumpTableInputStream pageTableInputStream = environmentFactory .getDumpTableInputStream(); pageTableInputStream.initialize(decompressor .getInputStream(metahistory), DumpTableEnum.PAGE); PageParser pageParser = environmentFactory.getPageParser(); pageParser.setInputStream(pageTableInputStream); return pageParser; } private CategorylinksParser createCategorylinksParser() throws IOException { String categorylinks = initialFiles.getCategoryLinksFile(); InputStream categorylinksStream = decompressor .getInputStream(categorylinks); return new CategorylinksParser(categorylinksStream); } private PagelinksParser createPagelinksParser() throws IOException { String pagelinks = initialFiles.getPageLinksFile(); InputStream pagelinksStream = decompressor.getInputStream(pagelinks); return new PagelinksParser(pagelinksStream); } private TextParser createTextParser() throws IOException { String metahistory = initialFiles.getMetaHistoryFile(); 
DumpTableInputStream textTableIntputStream = environmentFactory .getDumpTableInputStream(); textTableIntputStream.initialize(decompressor .getInputStream(metahistory), DumpTableEnum.TEXT); TextParser textParser = environmentFactory.getTextParser(); textParser.setInputStream(textTableIntputStream); return textParser; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy