All downloads are free. The search and download functionality uses the official Maven repository.

de.tudarmstadt.ukp.wikipedia.datamachine.dump.version.SingleDumpVersionOriginal Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.datamachine.dump.version;

import java.io.File;
import java.io.IOException;
import java.sql.Timestamp;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import de.tudarmstadt.ukp.wikipedia.wikimachine.debug.ILogger;
import de.tudarmstadt.ukp.wikipedia.wikimachine.domain.Files;
import de.tudarmstadt.ukp.wikipedia.wikimachine.domain.MetaData;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.sql.CategorylinksParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.sql.PagelinksParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.version.IDumpVersion;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.PageParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.RevisionParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.dump.xml.TextParser;
import de.tudarmstadt.ukp.wikipedia.wikimachine.util.Redirects;
import de.tudarmstadt.ukp.wikipedia.wikimachine.util.TxtFileWriter;

/**
 * Transforms a database from mediawiki format to JWPL format.
* The transformation produces .txt files for the different tables
* in the JWPL database.
*
* * Adopted to IDumpVersion by Galkin * * * */ public class SingleDumpVersionOriginal implements IDumpVersion { // metadata private String language; private String mainCategory; private String disambiguationsCategory; // statistics private int nrOfDisambiguations = 0; private int nrOfPages = 0; private int nrOfCategories = 0; private int nrOfRedirects = 0; private Map pPageIdNameMap;// maps page id's of pages to // their names private Map cPageIdNameMap;// maps page id's of categories // to their names private Map pNamePageIdMap;// maps names of pages to their // page id's. private Map cNamePageIdMap;// maps names of categories to // their page id's. private Map rPageIdNameMap;// maps page id's of redirects // to their names. private Set disambiguations; // caches the page id's of // disambiguation pages. private Map textIdPageIdMap;// maps text id's to the page // id's. // galkin: moved from local variables to fields private TxtFileWriter txtFW; private TxtFileWriter pageCategories; private TxtFileWriter categoryPages; private TxtFileWriter categoryInlinks; private TxtFileWriter categoryOutlinks; private TxtFileWriter pageInlinks; private TxtFileWriter pageOutlinks; private TxtFileWriter page; private TxtFileWriter pageMapLine; private TxtFileWriter pageRedirects; private String outputDir; // galkin: added private ILogger logger; private boolean skipPage = true; private boolean skipCategory = true; /** * Returns the String value of the bit 1 if the given boolean is true
* and an empty String otherwise. This the way bit values are written
* in .txt dump files. * * @param b * @return */ private String formatBoolean(boolean b) { return b ? new String(new byte[] { 1 }) : ""; } @Override public void exportAfterCategoryLinksParsing() throws IOException { pageCategories.export(); categoryPages.export(); categoryInlinks.export(); categoryOutlinks.export(); } @Override public void exportAfterPageLinksParsing() throws IOException { pageInlinks.export(); pageOutlinks.export(); } @Override public void exportAfterPageParsing() throws IOException { txtFW.export(); nrOfCategories = cPageIdNameMap.keySet().size(); nrOfPages = pPageIdNameMap.keySet().size() + rPageIdNameMap.keySet().size(); } @Override public void exportAfterRevisionParsing() throws IOException { } @Override public void exportAfterTextParsing() throws IOException { page.export(); pageRedirects.export(); pageMapLine.export(); } @Override public void flushByTextParsing() throws IOException { page.flush(); pageRedirects.flush(); pageMapLine.flush(); } @Override public void freeAfterCategoryLinksParsing() { } @Override public void freeAfterPageLinksParsing() { } @Override public void freeAfterPageParsing() { } @Override public void freeAfterRevisonParsing() { } @Override public void freeAfterTextParsing() { page.export(); pageRedirects.export(); pageMapLine.export(); } @Override public void initCategoryLinksParsing() throws IOException { pageCategories = new TxtFileWriter(outputDir + File.separator + "page_categories.txt"); categoryPages = new TxtFileWriter(outputDir + File.separator + "category_pages.txt"); categoryInlinks = new TxtFileWriter(outputDir + File.separator + "category_inlinks.txt"); categoryOutlinks = new TxtFileWriter(outputDir + File.separator + "category_outlinks.txt"); } @Override public void initPageLinksParsing() throws IOException { pageInlinks = new TxtFileWriter(outputDir + File.separator + "page_inlinks.txt"); pageOutlinks = new TxtFileWriter(outputDir + File.separator + "page_outlinks.txt"); } @Override public void 
initPageParsing() throws IOException { txtFW = new TxtFileWriter(outputDir + File.separator + "Category.txt"); } @Override public void initRevisionParsion() { } @Override public void initTextParsing() throws IOException { page = new TxtFileWriter(outputDir + File.separator + "Page.txt"); pageMapLine = new TxtFileWriter(outputDir + File.separator + "PageMapLine.txt"); pageRedirects = new TxtFileWriter(outputDir + File.separator + "page_redirects.txt"); } @Override public void initialize(Timestamp timestamp) { this.pPageIdNameMap = new HashMap(); this.cPageIdNameMap = new HashMap(); this.pNamePageIdMap = new HashMap(); this.cNamePageIdMap = new HashMap(); this.rPageIdNameMap = new HashMap(); this.disambiguations = new HashSet(); this.textIdPageIdMap = new HashMap(); } @Override public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException { int cl_from; String cl_to; cl_from = clParser.getClFrom(); cl_to = clParser.getClTo(); if (!cNamePageIdMap.containsKey(cl_to)) {// discard links with non // registred targets return; } // if the link source is a page then write the link in // category_pages and // page_categories if (pPageIdNameMap.containsKey(cl_from)) { categoryPages.addRow(cNamePageIdMap.get(cl_to), cl_from); pageCategories.addRow(cl_from, cNamePageIdMap.get(cl_to)); if (cl_to.equals(disambiguationsCategory)) { disambiguations.add(cl_from); nrOfDisambiguations++; } } else { // if the link source is a category than write the link in // category_inlinks and category_outlinks if (cPageIdNameMap.containsKey(cl_from)) { categoryOutlinks.addRow(cNamePageIdMap.get(cl_to), cl_from); categoryInlinks.addRow(cl_from, cNamePageIdMap.get(cl_to)); } } } @Override public void processPageLinksRow(PagelinksParser plParser) throws IOException { int pl_from; String pl_to; pl_from = plParser.getPlFrom(); pl_to = plParser.getPlTo(); // skip redirects or page with other namespace than 0 if (skipPage && !pPageIdNameMap.containsKey(pl_from) || 
!pNamePageIdMap.containsKey(pl_to)) { return; } pageOutlinks.addRow(pl_from, pNamePageIdMap.get(pl_to)); pageInlinks.addRow(pNamePageIdMap.get(pl_to), pl_from); } @Override public void processPageRow(PageParser pageParser) throws IOException { int page_id; int page_namespace; String page_title; page_namespace = pageParser.getPageNamespace(); // handle categories if (page_namespace == 14) { if (skipCategory) { if (pageParser.getPageIsRedirect()) // skip categories that are redirects return; } // retrieve page id and page title page_id = pageParser.getPageId(); page_title = pageParser.getPageTitle(); if (page_title.equals(disambiguationsCategory)) { logger.log("Disambiguations Category found: " + page_title); } if (page_title.equals(mainCategory)) { logger.log("Main Category found: " + page_title); } // cache the retrieved values cPageIdNameMap.put(page_id, page_title); cNamePageIdMap.put(page_title, page_id); // write a new row in the table Category. // Note that we also consider the page_id as id txtFW.addRow(page_id, page_id, page_title); return; } // handle pages if (page_namespace == 0) { // retrieve page id and title page_id = pageParser.getPageId(); page_title = pageParser.getPageTitle(); // distinguish redirects if (pageParser.getPageIsRedirect()) { rPageIdNameMap.put(page_id, page_title); } else { pPageIdNameMap.put(page_id, page_title); pNamePageIdMap.put(page_title, page_id); } } } @Override public void processRevisionRow(RevisionParser revisionParser) { textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser .getRevPage()); } @Override public void processTextRow(TextParser textParser) throws IOException { String destination; int text_id; int page_id; text_id = textParser.getOldId(); if (!textIdPageIdMap.containsKey(text_id)) return; page_id = textIdPageIdMap.get(text_id); if (pPageIdNameMap.containsKey(page_id)) {// pages page.addRow(page_id, page_id, pPageIdNameMap.get(page_id), textParser.getOldText(), formatBoolean(disambiguations 
.contains(page_id))); pageMapLine.addRow(page_id, pPageIdNameMap.get(page_id), page_id, "NULL", "NULL"); return; } if (rPageIdNameMap.containsKey(page_id)) {// Redirects destination = Redirects.getRedirectDestination(textParser .getOldText()); if (!pNamePageIdMap.containsKey(destination)) return; pageRedirects.addRow(pNamePageIdMap.get(destination), rPageIdNameMap.get(page_id)); pageMapLine.addRow(page_id, rPageIdNameMap.get(page_id), pNamePageIdMap.get(destination), "NULL", "NULL"); nrOfRedirects++; } } @Override public void setFiles(Files versionFiles) { // galkin: only output directory will be used, other file names will be // taken from original source code outputDir = versionFiles.getOutputDirectory().getAbsolutePath(); } @Override public void setLogger(ILogger logger) { this.logger = logger; } @Override public void setMetaData(MetaData commonMetaData) { this.language = commonMetaData.getLanguage(); this.mainCategory = commonMetaData.getMainCategory(); this.disambiguationsCategory = commonMetaData .getDisambiguationCategory(); } @Override public void writeMetaData() throws IOException { TxtFileWriter metaData = new TxtFileWriter(outputDir + File.separator + "MetaData.txt"); // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories metaData.addRow("null", language, disambiguationsCategory, mainCategory, nrOfPages, nrOfRedirects, nrOfDisambiguations, nrOfCategories); metaData.export(); } @Override public void setCategoryRedirectsSkip(boolean skipCategory) { this.skipCategory = skipCategory; } @Override public void setPageRedirectsSkip(boolean skipPage) { this.skipPage = skipPage; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy