All downloads are free. Search and download functionality uses the official Maven repository.

com.google.refine.importers.WikitextImporter Maven / Gradle / Ivy

Go to download

OpenRefine is a free, open-source power tool for working with messy data and improving it.

There is a newer version: 3.9-beta1
Show newest version
/*******************************************************************************
 * Copyright (C) 2018, OpenRefine contributors
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************/

package com.google.refine.importers;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.base.CharMatcher;
import com.google.common.io.CharStreams;
import de.fau.cs.osr.ptk.common.AstVisitor;
import org.sweble.wikitext.parser.ParserConfig;
import org.sweble.wikitext.parser.WikitextEncodingValidator;
import org.sweble.wikitext.parser.WikitextParser;
import org.sweble.wikitext.parser.WikitextPreprocessor;
import org.sweble.wikitext.parser.encval.ValidatedWikitext;
import org.sweble.wikitext.parser.nodes.WtBody;
import org.sweble.wikitext.parser.nodes.WtBold;
import org.sweble.wikitext.parser.nodes.WtExternalLink;
import org.sweble.wikitext.parser.nodes.WtImageLink;
import org.sweble.wikitext.parser.nodes.WtInternalLink;
import org.sweble.wikitext.parser.nodes.WtItalics;
import org.sweble.wikitext.parser.nodes.WtLinkTitle;
import org.sweble.wikitext.parser.nodes.WtLinkTitle.WtNoLinkTitle;
import org.sweble.wikitext.parser.nodes.WtName;
import org.sweble.wikitext.parser.nodes.WtNewline;
import org.sweble.wikitext.parser.nodes.WtNode;
import org.sweble.wikitext.parser.nodes.WtParsedWikitextPage;
import org.sweble.wikitext.parser.nodes.WtPreproWikitextPage;
import org.sweble.wikitext.parser.nodes.WtSection;
import org.sweble.wikitext.parser.nodes.WtTable;
import org.sweble.wikitext.parser.nodes.WtTableCaption;
import org.sweble.wikitext.parser.nodes.WtTableCell;
import org.sweble.wikitext.parser.nodes.WtTableHeader;
import org.sweble.wikitext.parser.nodes.WtTableRow;
import org.sweble.wikitext.parser.nodes.WtTagExtension;
import org.sweble.wikitext.parser.nodes.WtTagExtensionBody;
import org.sweble.wikitext.parser.nodes.WtTemplate;
import org.sweble.wikitext.parser.nodes.WtTemplateArgument;
import org.sweble.wikitext.parser.nodes.WtTemplateArguments;
import org.sweble.wikitext.parser.nodes.WtText;
import org.sweble.wikitext.parser.nodes.WtUrl;
import org.sweble.wikitext.parser.nodes.WtValue;
import org.sweble.wikitext.parser.nodes.WtXmlAttribute;
import org.sweble.wikitext.parser.nodes.WtXmlAttributes;
import org.sweble.wikitext.parser.nodes.WtXmlEmptyTag;
import org.sweble.wikitext.parser.nodes.WtXmlEndTag;
import org.sweble.wikitext.parser.nodes.WtXmlStartTag;
import org.sweble.wikitext.parser.parser.PreprocessorToParserTransformer;
import org.sweble.wikitext.parser.preprocessor.PreprocessedWikitext;
import org.sweble.wikitext.parser.utils.SimpleParserConfig;
import xtc.parser.ParseException;

import com.google.refine.ProjectMetadata;
import com.google.refine.importing.ImportingJob;
import com.google.refine.model.Cell;
import com.google.refine.model.Column;
import com.google.refine.model.Project;
import com.google.refine.model.Recon;
import com.google.refine.model.ReconStats;
import com.google.refine.model.recon.ReconJob;
import com.google.refine.model.recon.StandardReconConfig;
import com.google.refine.model.recon.StandardReconConfig.ColumnDetail;
import com.google.refine.util.JSONUtilities;

public class WikitextImporter extends TabularImportingParserBase {
    // static final private Logger logger = LoggerFactory.getLogger(WikitextImporter.class);

    /**
     * Creates a wikitext importer.
     *
     * <p>NOTE(review): the meaning of the {@code false} flag handed to the superclass constructor is
     * defined in {@code TabularImportingParserBase}, which is not visible from this file — confirm there.
     */
    public WikitextImporter() {
        super(false);
    }

    /**
     * Produces the initial option values shown in the parser UI for a wikitext import.
     *
     * <p>Starts from whatever the superclass returns and then sets the wikitext-specific defaults:
     * cell value types are not guessed, spanning cells are blanked, raw templates are left out,
     * the wiki base URL points at the English Wikipedia and references are parsed.
     *
     * @param job the importing job, forwarded unchanged to the superclass
     * @param fileRecords the file records, forwarded unchanged to the superclass
     * @param format the format identifier, forwarded unchanged to the superclass
     * @return the option object returned by the superclass, with the defaults above applied
     */
    @Override
    public ObjectNode createParserUIInitializationData(
            ImportingJob job, List fileRecords, String format) {
        final ObjectNode uiDefaults = super.createParserUIInitializationData(job, fileRecords, format);

        // Wikitext-specific defaults, layered on top of the inherited ones.
        JSONUtilities.safePut(uiDefaults, "guessCellValueTypes", false);
        JSONUtilities.safePut(uiDefaults, "blankSpanningCells", true);
        JSONUtilities.safePut(uiDefaults, "includeRawTemplates", false);
        JSONUtilities.safePut(uiDefaults, "wikiUrl", "https://en.wikipedia.org/wiki/");
        JSONUtilities.safePut(uiDefaults, "parseReferences", true);

        return uiDefaults;
    }

    /**
     * Plain holder describing a table cell that covers more than one row and/or column,
     * so that it can be replayed into the rows and columns it spans.
     */
    private class SpanningCell {

        /** Rendered text of the cell. */
        public String value;
        /** Value passed as {@code reconciled} at construction; may be {@code null}. */
        public String reconciled;
        /** Value passed as {@code reference} at construction; may be {@code null}. */
        public String reference;
        /** Number of columns covered by the cell. */
        public int colspan;
        /** Number of rows covered by the cell. */
        public int rowspan;
        /** Index of the row in which the cell was declared. */
        public int row;
        /** Index of the column in which the cell was declared. */
        public int col;

        SpanningCell(String value, String reconciled, String reference, int row, int col, int rowspan, int colspan) {
            // Position of the declaring cell
            this.row = row;
            this.col = col;
            // Extent of the span
            this.rowspan = rowspan;
            this.colspan = colspan;
            // Content carried over to the spanned positions
            this.value = value;
            this.reconciled = reconciled;
            this.reference = reference;
        }
    }

    /**
     * Plain holder for a table cell associated with an internal wiki link, identified by its
     * row and column indices.
     */
    private class WikilinkedCell {

        /** Target of the internal link, as written in the wikitext. */
        public String internalLink;
        /** Row index of the cell. */
        public int row;
        /** Column index of the cell. */
        public int col;

        WikilinkedCell(String internalLink, int row, int col) {
            this.row = row;
            this.col = col;
            this.internalLink = internalLink;
        }

        /**
         * Builds the URL of the linked page by appending the link target to the given base URL.
         * No escaping or separator is inserted.
         *
         * @param wikiBaseUrl prefix to put in front of the link target
         * @return {@code wikiBaseUrl} immediately followed by {@link #internalLink}
         */
        public String toURL(String wikiBaseUrl) {
            String pageUrl = wikiBaseUrl + internalLink;
            return pageUrl;
        }
    }

    public class WikitextTableVisitor extends AstVisitor {

        public String caption;
        public List> rows;
        public List> references;
        public List wikilinkedCells;

        private List currentRow;
        private List currentRowReferences;
        private Map namedReferences;

        private boolean blankSpanningCells;
        private boolean includeRawTemplates;

        private int rowId;
        private List spanningCells;
        private StringBuilder cellStringBuilder;
        private StringBuilder xmlAttrStringBuilder;
        private String currentXmlAttr;
        private String currentInternalLink;
        private String currentExternalLink;
        private String lastExternalLink;
        private String currentReference;
        private String currentReferenceName;
        private int colspan;
        private int rowspan;
        private int spanningCellIdx;
        private List internalLinksInCell;

        private final Pattern urlPattern = Pattern.compile("\\b(https?|ftp)://[-a-zA-Z0-9+&@#/%?=~_!:,.;]*[-a-zA-Z0-9+&@#/%=~_]",
                Pattern.CASE_INSENSITIVE);

        public WikitextTableVisitor(boolean blankSpanningCells, boolean includeRawTemplates) {
            this.blankSpanningCells = blankSpanningCells;
            this.includeRawTemplates = includeRawTemplates;
            caption = null;
            rows = new ArrayList>();
            references = new ArrayList>();
            wikilinkedCells = new ArrayList();
            spanningCells = new ArrayList();
            cellStringBuilder = null;
            xmlAttrStringBuilder = null;
            currentRowReferences = null;
            currentInternalLink = null;
            currentExternalLink = null;
            lastExternalLink = null;
            currentReference = null;
            currentReferenceName = null;
            colspan = 0;
            rowspan = 0;
            rowId = 0;
            spanningCellIdx = 0;
            internalLinksInCell = new ArrayList();
            namedReferences = new HashMap();
        }

        @Override
        protected WtNode before(WtNode node) {
            return super.before(node);
        }

        /* Default handler */

        public void visit(WtNode e) {
            // Ignore other nodes
            // System.out.println(e.getNodeName());
        }

        /* Table handling */

        public void visit(WtTable e) {
            iterate(e);
        }

        public void visit(WtTableCaption e) {
            caption = renderCellAsString(e);
        }

        public void visit(WtTableRow e) {
            if (currentRow != null) {
                finishRow();
            }
            startRow();
            iterate(e);
            finishRow();
        }

        private void startRow() {
            currentRow = new ArrayList();
            currentRowReferences = new ArrayList();
            spanningCellIdx = 0;
            addSpanningCells();
        }

        private void finishRow() {
            if (currentRow.size() > 0) {
                rows.add(currentRow);
                references.add(currentRowReferences);
                rowId++;
            }
            currentRow = null;
        }

        public void visit(WtTableCell e) {
            addCell(e);
        }

        public void visit(WtTableHeader e) {
            addCell(e);
        }

        public void addCell(WtNode e) {
            if (currentRow == null) {
                startRow();
            }
            rowspan = 1;
            colspan = 1;
            internalLinksInCell.clear();
            currentReference = null;
            currentReferenceName = null;

            String value = renderCellAsString(e);

            int colId = currentRow.size();

            // Add the cell to the row we are currently building
            currentRow.add(value);
            currentRowReferences.add(currentReference);

            // Reconcile it if we found exactly one link in the cell
            String reconciled = null;
            if (internalLinksInCell.size() == 1) {
                reconciled = internalLinksInCell.get(0);
                wikilinkedCells.add(new WikilinkedCell(reconciled, rowId, colId));
            }

            // Mark it as spanning if we found the tags
            if (colspan > 1 || rowspan > 1) {
                SpanningCell spanningCell = new SpanningCell(
                        value, reconciled, currentReference,
                        rowId, colId, rowspan, colspan);
                spanningCells.add(spanningCellIdx, spanningCell);
            }

            // Add all spanning cells that need to be inserted after this one.
            addSpanningCells();
        }

        public String renderCellAsString(WtNode e) {
            cellStringBuilder = new StringBuilder();
            iterate(e);
            String value = cellStringBuilder.toString();
            if (value == null) {
                value = "";
            }
            value = CharMatcher.whitespace().trimFrom(value);
            cellStringBuilder = null;
            return value;
        }

        public void visit(WtText text) {
            writeText(text.getContent());
        }

        public void visit(WtNewline e) {
            writeText("\n");
        }

        public void visit(WtXmlEmptyTag tag) {
            if ("br".equals(tag.getName())) {
                writeText("\n");
            }
        }

        public void visit(WtXmlStartTag tag) {
            if ("br".equals(tag.getName())) {
                writeText("\n");
            }
        }

        public void visit(WtXmlEndTag tag) {
            if ("br".equals(tag.getName())) {
                writeText("\n");
            }
        }

        public void visit(WtTagExtension tag) {
            if ("ref".equals(tag.getName())) {
                lastExternalLink = null;
                currentReferenceName = null;

                iterate(tag);

                // load any reference parsed earlier
                if (currentReferenceName != null) {
                    currentReference = namedReferences.get(currentReferenceName);
                } else {
                    currentReferenceName = "";
                }
                // update with any new link found in the body of the reference
                if (lastExternalLink != null) {
                    currentReference = lastExternalLink;
                }

                // store the reference for later use
                if (currentReference != null && !"".equals(currentReferenceName)) {
                    namedReferences.put(currentReferenceName, currentReference);
                }
            }
        }

        public void visit(WtTagExtensionBody body) {
            /*
             * Here, the content of the  tag is not parsed further, it's just a String. So we have to resort to
             * string matching. https://github.com/sweble/sweble-wikitext/issues/67
             */
            String contents = body.getContent();
            Matcher matcher = urlPattern.matcher(contents);
            while (matcher.find()) {
                lastExternalLink = contents.substring(matcher.start(), matcher.end());
            }
        }

        public void writeText(String text) {
            // do not render text that is inside 
            if (currentReferenceName == null) {
                if (xmlAttrStringBuilder != null) {
                    xmlAttrStringBuilder.append(text);
                } else if (cellStringBuilder != null) {
                    cellStringBuilder.append(text);
                }
            }
        }

        /* Spanning cell helpers */

        private SpanningCell spanningCell() {
            return spanningCells.get(spanningCellIdx);
        }

        private void addSpanningCells() {
            while (spanningCellIdx < spanningCells.size() &&
                    currentRow.size() >= spanningCell().col) {
                // Add blank cells to represent the current spanning cell
                SpanningCell cell = spanningCell();
                if (cell.row + cell.rowspan >= rowId + 1) {
                    while (currentRow.size() < cell.col + cell.colspan) {
                        if (blankSpanningCells) {
                            currentRow.add(null);
                            currentRowReferences.add(null);
                        } else {
                            currentRow.add(cell.value);
                            currentRowReferences.add(cell.reference);
                            if (cell.reconciled != null) {
                                wikilinkedCells.add(new WikilinkedCell(cell.reconciled, rowId, currentRow.size() - 1));
                            }
                        }
                    }
                }
                // Check if this spanning cell has been fully represented
                if (cell.row + cell.rowspan <= rowId + 1) {
                    spanningCells.remove(spanningCellIdx);
                } else {
                    spanningCellIdx++;
                }
            }
        }

        /* XML attributes : useful for colspan and rowspan, and reference names */

        public void visit(WtXmlAttributes e) {
            iterate(e);
        }

        public void visit(WtXmlAttribute e) {
            xmlAttrStringBuilder = new StringBuilder();
            iterate(e);
            try {
                if ("colspan".equals(currentXmlAttr)) {
                    colspan = Integer.parseInt(xmlAttrStringBuilder.toString());
                } else if ("rowspan".equals(currentXmlAttr)) {
                    rowspan = Integer.parseInt(xmlAttrStringBuilder.toString());
                } else if ("name".equals(currentXmlAttr)) {
                    currentReferenceName = xmlAttrStringBuilder.toString();
                }
            } catch (NumberFormatException nfe) {
            }
            currentXmlAttr = null;
            xmlAttrStringBuilder = null;
        }

        public void visit(WtName e) {
            try {
                currentXmlAttr = e.getAsString();
            } catch (UnsupportedOperationException soe) {
                currentXmlAttr = null;
            }
        }

        public void visit(WtValue e) {
            iterate(e);
        }

        /* Link management */

        public void visit(WtInternalLink e) {
            currentInternalLink = e.getTarget().getAsString();
            internalLinksInCell.add(currentInternalLink);
            iterate(e);
            currentInternalLink = null;
        }

        public void visit(WtExternalLink e) {
            WtUrl url = e.getTarget();
            String externalLink = url.getProtocol() + ":" + url.getPath();
            if (cellStringBuilder != null) {
                if (rowId >= 0) {
                    // We are inside the table: all hyperlinks
                    // should be converted to their URLs regardless of
                    // their label.
                    cellStringBuilder.append(externalLink);
                } else {
                    // We are in the header: keep the labels instead
                    currentExternalLink = externalLink;
                    iterate(e);
                    currentExternalLink = null;
                }
            }
            lastExternalLink = externalLink;
        }

        public void visit(WtNoLinkTitle e) {
            if (cellStringBuilder != null) {
                if (currentInternalLink != null) {
                    cellStringBuilder.append(currentInternalLink);
                } else if (currentExternalLink != null) {
                    cellStringBuilder.append(currentExternalLink);
                }
            }
        }

        public void visit(WtLinkTitle e) {
            iterate(e);
        }

        public void visit(WtUrl e) {
            // already handled, in WtExternalLink, added here for clarity
        }

        /* Templates */

        public void visit(WtTemplate e) {
            // only render templates if we are told to do so or inside a reference
            if (includeRawTemplates || currentReferenceName != null) {
                writeText("{{" + e.getName().getAsString());
                WtTemplateArguments args = e.getArgs();
                for (int i = 0; i != args.size(); i++) {
                    writeText("|");
                    iterate(args.get(i));
                }
                writeText("}}");
            }
        }

        public void visit(WtTemplateArgument e) {
            // do not render templates that are inside a reference
            if (currentReferenceName == null) {
                writeText("|");
                if (e.hasName()) {
                    writeText(e.getName().getAsString());
                    writeText("=");
                }
            }
            iterate(e.getValue());
        }

        public void visit(WtImageLink e) {
            if (includeRawTemplates) {
                writeText("[[");
                writeText(e.getTarget().getAsString());
                writeText("]]");
            }
        }

        /* Content blocks */

        public void visit(WtParsedWikitextPage e) {
            iterate(e);
        }

        public void visit(WtSection e) {
            iterate(e);
        }

        public void visit(WtBody e) {
            iterate(e);
        }

        public void visit(WtItalics e) {
            iterate(e);
        }

        public void visit(WtBold e) {
            iterate(e);
        }

        @Override
        protected Object after(WtNode node, Object result) {
            return rows;
        }
    }

    public class WikiTableDataReader implements TableDataReader {

        private int currentRow = 0;
        private WikitextTableVisitor visitor = null;
        private List> reconList = null;
        private List columnReconciled = null;
        private List columnReferenced = null;

        public WikiTableDataReader(WikitextTableVisitor visitor, boolean references) {
            this.visitor = visitor;
            currentRow = 0;
            reconList = null;

            if (references) {
                // Check which column had references
                columnReferenced = new ArrayList();
                for (List row : this.visitor.references) {
                    for (int i = 0; i != row.size(); i++) {
                        while (i >= columnReferenced.size()) {
                            columnReferenced.add(false);
                        }
                        if (row.get(i) != null) {
                            columnReferenced.set(i, true);
                        }
                    }
                }
            }
        }

        @Override
        public List getNextRowOfCells() throws IOException {
            List row = null;
            List origRow = null;
            List refRow = null;
            if (currentRow < this.visitor.rows.size()) {
                origRow = this.visitor.rows.get(currentRow);
                refRow = this.visitor.references.get(currentRow);
            }

            if (origRow != null) {
                row = new ArrayList();
                for (int i = 0; i < origRow.size(); i++) {
                    Recon recon = null;
                    if (currentRow >= 0 && reconList != null) {
                        recon = reconList.get(currentRow).get(i);
                    }
                    String value = origRow.get(i);
                    if (value != null) {
                        row.add(new Cell(value, recon));
                    } else {
                        row.add(null);
                    }

                    // if we should add reference columns…
                    if (columnReferenced != null && i < columnReferenced.size() && columnReferenced.get(i)) {
                        String refValue = null;
                        // for headers
                        if (currentRow == -1) {
                            refValue = origRow.get(i) + "_ref";
                        } else {
                            refValue = refRow.get(i);
                        }
                        if (refValue != null) {
                            row.add(new Cell(refValue, null));
                        } else {
                            row.add(null);
                        }
                    }
                }
            }
            currentRow++;
            return row;
        }

        private void reconcileToQids(String wikiBaseUrl, StandardReconConfig cfg) {
            if ("null".equals(wikiBaseUrl)) {
                return; // TODO: more thorough URL validation instead
            }

            // Init the list of recons
            reconList = new ArrayList>();
            columnReconciled = new ArrayList();
            for (int i = 0; i < this.visitor.rows.size(); i++) {
                int rowSize = this.visitor.rows.get(i).size();
                List recons = new ArrayList(rowSize);
                for (int j = 0; j < rowSize; j++) {
                    recons.add(null);
                    if (j >= columnReconciled.size())
                        columnReconciled.add(false);
                }
                reconList.add(recons);

            }

            int batchSize = 50;
            int i = 0;
            int totalSize = this.visitor.wikilinkedCells.size();
            while (i < totalSize) {
                List jobs = new ArrayList();
                int batchStart = i;
                while (i < batchStart + batchSize && i < totalSize) {
                    WikilinkedCell cell = this.visitor.wikilinkedCells.get(i);
                    jobs.add(cfg.createSimpleJob(cell.toURL(wikiBaseUrl)));
                    i++;
                }

                List recons = cfg.batchRecon(jobs, 0);
                for (int j = batchStart; j < batchStart + batchSize && j < totalSize; j++) {
                    WikilinkedCell cell = this.visitor.wikilinkedCells.get(j);
                    Recon recon = recons.get(j - batchStart);
                    if (recon != null) {
                        reconList.get(cell.row).set(cell.col, recon);
                        columnReconciled.set(cell.col, true);
                    }
                }
            }
        }
    }

    /**
     * Parses wikitext read from {@code reader}, extracts its table contents and loads them
     * into {@code project}.
     *
     * <p>Steps: encoding validation, preprocessing and parsing with Sweble, table extraction with
     * a {@link WikitextTableVisitor}, optional reconciliation of internal links, import of the
     * rows through {@code TabularImportingParserBase.readTable}, and finally recording of
     * reconciliation statistics on the reconciled columns.
     *
     * <p>Options read: {@code blankSpanningCells} (default {@code true}),
     * {@code includeRawTemplates} (default {@code false}), {@code parseReferences} (default
     * {@code true}), {@code wikiUrl} (no reconciliation when absent) and {@code reconService}.
     *
     * @param project project receiving the rows
     * @param metadata project metadata; its name is set to the table caption when there is one
     * @param job the importing job, forwarded to {@code readTable}
     * @param fileSource not used by this importer
     * @param reader source of the wikitext; it is read fully but not closed here
     * @param limit row limit, forwarded to {@code readTable}
     * @param options parser options, see above
     * @param exceptions receives any {@link IOException} or {@link ParseException} raised while
     *        reading or parsing
     */
    @Override
    public void parseOneFile(
            Project project,
            ProjectMetadata metadata,
            ImportingJob job,
            String fileSource,
            Reader reader,
            int limit,
            ObjectNode options,
            List exceptions) {
        // Set-up a simple wiki configuration
        ParserConfig parserConfig = new SimpleParserConfig();

        try {
            // Encoding validation
            WikitextEncodingValidator validator = new WikitextEncodingValidator();

            String wikitext = CharStreams.toString(reader);
            String title = "Page title";
            ValidatedWikitext validated = validator.validate(parserConfig, wikitext, title);

            // Pre-processing
            WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
            WtPreproWikitextPage prepArticle = (WtPreproWikitextPage) prep.parseArticle(validated, title, false);

            // Parsing
            PreprocessedWikitext ppw = PreprocessorToParserTransformer
                    .transform(prepArticle);
            WikitextParser parser = new WikitextParser(parserConfig);
            WtParsedWikitextPage parsedArticle = (WtParsedWikitextPage) parser.parseArticle(ppw, title);

            // Compile the retrieved page
            boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true);
            boolean includeRawTemplates = JSONUtilities.getBoolean(options, "includeRawTemplates", false);
            boolean parseReferences = JSONUtilities.getBoolean(options, "parseReferences", true);
            final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells, includeRawTemplates);
            vs.go(parsedArticle);

            WikiTableDataReader dataReader = new WikiTableDataReader(vs, parseReferences);

            // Reconcile if needed
            String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
            // Wikidata reconciliation endpoint, hardcoded because the user might not have it in its services
            String reconUrl = JSONUtilities.getString(options, "reconService",
                    "https://wikidata.reconci.link/en/api");
            StandardReconConfig cfg = getReconConfig(reconUrl);

            if (wikiUrl != null) {
                dataReader.reconcileToQids(wikiUrl, cfg);
            }

            // Set metadata
            if (vs.caption != null && vs.caption.length() > 0) {
                metadata.setName(vs.caption);
                // TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
            }

            TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);

            // Add reconciliation statistics
            if (dataReader.columnReconciled != null) {
                // Never index past the columns that were actually created in the project.
                // NOTE(review): the index used here is the column position in the wikitext
                // table; when reference cells are inserted the project's columns may be shifted
                // relative to it — confirm against readTable.
                int columnCount = project.columnModel.columns.size();
                for (int i = 0; i < dataReader.columnReconciled.size() && i < columnCount; i++) {
                    if (dataReader.columnReconciled.get(i)) {
                        Column col = project.columnModel.columns.get(i);
                        col.setReconStats(ReconStats.create(project, i));
                        col.setReconConfig(cfg);
                    }
                }
            }
        } catch (IOException | ParseException e) {
            // Report read failures the same way as parse failures, so the caller can see
            // that the import did not succeed.
            exceptions.add(e);
            e.printStackTrace();
        }
    }

    /**
     * Builds the reconciliation configuration used to match wiki links to Wikidata entities.
     *
     * <p>Everything except the service URL is fixed: the Wikidata entity and direct-property
     * URI prefixes, an empty string and {@code "entity"} as the two type-related arguments, an
     * empty list of column details, and the literal values {@code true}, {@code 10} and
     * {@code 1}. NOTE(review): the meaning of those last three positional arguments is defined
     * by the {@code StandardReconConfig} constructor, which is not visible here — confirm there.
     *
     * @param url URL of the reconciliation service
     * @return a new configuration targeting {@code url}
     */
    private StandardReconConfig getReconConfig(String url) {
        return new StandardReconConfig(
                url,
                "http://www.wikidata.org/entity/",
                "http://www.wikidata.org/prop/direct/",
                "",
                "entity",
                true,
                10,
                new ArrayList<ColumnDetail>(),
                1);
    }

}