All downloads are free. The search and download features use the official Maven repository.

org.apache.nutch.parse.ParsePluginsReader Maven / Gradle / Ivy

There is a newer version: 2.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse;

import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.util.NutchConfiguration;

/**
 * A reader to load the information stored in the
 * $NUTCH_HOME/conf/parse-plugins.xml file.
 *
 * <p>
 * The file is read either from the URL set via
 * {@link #setFParsePluginsFile(String)} or, when no URL has been set, from the
 * configuration resource named by the <code>parse.plugin.file</code> property.
 * </p>
 *
 * @author mattmann
 * @version 1.0
 */
class ParsePluginsReader {

  /* our log stream */
  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /** The property name of the parse-plugins location */
  private static final String PP_FILE_PROP = "parse.plugin.file";

  /** Marker value meaning "no usable order attribute was given" */
  private static final int NO_ORDER = -1;

  /** the parse-plugins file */
  private String fParsePluginsFile = null;

  /**
   * Constructs a new ParsePluginsReader
   */
  public ParsePluginsReader() {
  }

  /**
   * Reads the parse-plugins.xml file and returns the {@link ParsePluginList}
   * defined by it.
   *
   * <p>
   * Failures are logged as warnings and never thrown. The two failure modes
   * return different values, and this distinction is kept for existing
   * callers:
   * </p>
   * <ul>
   * <li>if an explicit file URL was set and it cannot be opened, an
   * <em>empty</em> {@link ParsePluginList} is returned;</li>
   * <li>if the input cannot be located, read or parsed as XML,
   * <code>null</code> is returned.</li>
   * </ul>
   *
   * @param conf
   *          the configuration used to look up and open the parse-plugins
   *          resource when no explicit file URL has been set
   * @return the {@link ParsePluginList} specified by the parse-plugins.xml
   *         file, an empty list if the explicit URL could not be opened, or
   *         <code>null</code> if the input could not be parsed
   */
  public ParsePluginList parse(Configuration conf) {

    ParsePluginList pList = new ParsePluginList();

    // name of whatever we are reading, used only for log messages
    String source;
    InputStream ppInputStream;

    if (fParsePluginsFile != null) {
      source = fParsePluginsFile;
      try {
        ppInputStream = new URL(fParsePluginsFile).openStream();
      } catch (Exception e) {
        // e.toString() is passed explicitly so the exception fills the
        // placeholder instead of being treated as the log event's throwable
        LOG.warn("Unable to load parse plugins file from URL [{}]. "
            + "Reason is [{}]", fParsePluginsFile, e.toString());
        return pList;
      }
    } else {
      source = conf.get(PP_FILE_PROP);
      if (source == null) {
        LOG.warn("Unable to parse the parse plugins file: property [{}] "
            + "is not set", PP_FILE_PROP);
        return null;
      }
      ppInputStream = conf.getConfResourceAsInputStream(source);
      if (ppInputStream == null) {
        LOG.warn("Unable to parse [{}]. Reason is [resource not found]",
            source);
        return null;
      }
    }

    // parse the XML; the stream is closed whether or not parsing succeeds
    Document document;
    try (InputStream in = ppInputStream) {
      DocumentBuilder parser = DocumentBuilderFactory.newInstance()
          .newDocumentBuilder();
      document = parser.parse(new InputSource(in));
    } catch (Exception e) {
      LOG.warn("Unable to parse [{}]. Reason is [{}]", source, e.toString());
      return null;
    }

    Element parsePlugins = document.getDocumentElement();

    // build up the alias hash map and store it on the parse plugin list
    Map<String, String> aliases = getAliases(parsePlugins);
    pList.setAliases(aliases);

    // get all the mime type nodes
    NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");

    // iterate through the mime types
    for (int i = 0; i < mimeTypes.getLength(); i++) {
      Element mimeType = (Element) mimeTypes.item(i);
      String mimeTypeStr = mimeType.getAttribute("name");

      // for each mimeType, get the plugin list
      NodeList pluginList = mimeType.getElementsByTagName("plugin");

      if (pluginList == null || pluginList.getLength() == 0) {
        LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: "
            + "{}, continuing parse", mimeTypeStr);
        continue;
      }

      // now add the plugin list and map it to this mimeType
      pList.setPluginList(mimeTypeStr,
          resolveExtensionIds(pluginList, aliases, mimeTypeStr));
    }
    return pList;
  }

  /**
   * Builds the ordered list of extension ids for one mime type.
   *
   * <p>
   * Plugins are added in the order read. A plugin carrying a numeric
   * <code>order</code> attribute is instead inserted at the 1-based position
   * it names, relative to the list built so far. A position that does not fit
   * the list built so far (zero, negative, or beyond the end) is logged and
   * the plugin is appended, rather than aborting the whole parse with an
   * {@link IndexOutOfBoundsException}.
   * </p>
   *
   * @param pluginList
   *          the <code>plugin</code> elements of the mime type
   * @param aliases
   *          map from plugin id to extension id; ids without an alias are
   *          used as extension ids directly
   * @param mimeTypeStr
   *          the mime type name, used for log messages only
   * @return the extension ids in their final order
   */
  private static List<String> resolveExtensionIds(NodeList pluginList,
      Map<String, String> aliases, String mimeTypeStr) {

    List<String> plugList = new ArrayList<>(pluginList.getLength());

    for (int j = 0; j < pluginList.getLength(); j++) {
      Element plugin = (Element) pluginList.item(j);
      String pluginId = plugin.getAttribute("id");
      String extId = aliases.get(pluginId);
      if (extId == null) {
        // Assume an extension id is directly specified
        extId = pluginId;
      }

      int order = parseOrder(plugin.getAttribute("order"));
      int index = order - 1;

      if (order == NO_ORDER) {
        plugList.add(extId);
      } else if (index >= 0 && index <= plugList.size()) {
        plugList.add(index, extId);
      } else {
        LOG.warn("Ignoring out-of-range order [{}] of plugin [{}] for mime "
            + "type [{}]; appending it instead", order, pluginId, mimeTypeStr);
        plugList.add(extId);
      }
    }
    return plugList;
  }

  /**
   * Parses the value of an <code>order</code> attribute.
   *
   * @param orderStr
   *          the raw attribute value; empty when the attribute is absent
   * @return the parsed integer, or {@link #NO_ORDER} if the value is absent
   *         or not a valid integer
   */
  private static int parseOrder(String orderStr) {
    if (orderStr == null || orderStr.isEmpty()) {
      return NO_ORDER;
    }
    try {
      return Integer.parseInt(orderStr);
    } catch (NumberFormatException ignored) {
      // a non-numeric order is treated as if no order had been given
      return NO_ORDER;
    }
  }

  /**
   * Tests parsing of the parse-plugins.xml file. An alternative location for
   * the file can be specified via the --file option; the value is handed to
   * {@link java.net.URL#URL(String)} and therefore must be a valid URL.
   *
   * @param args
   *          Currently only the --file argument to specify an alternative
   *          location for the parse-plugins.xml file is supported.
   */
  public static void main(String[] args) throws Exception {
    String parsePluginFile = null;
    String usage = "ParsePluginsReader [--file <parse plugin file location>]";

    if ((args.length != 0 && args.length != 2)
        || (args.length == 2 && !"--file".equals(args[0]))) {
      System.err.println(usage);
      System.exit(1);
    }

    // the check above guarantees that "--file" is followed by a value
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("--file")) {
        parsePluginFile = args[++i];
      }
    }

    ParsePluginsReader reader = new ParsePluginsReader();

    if (parsePluginFile != null) {
      reader.setFParsePluginsFile(parsePluginFile);
    }

    ParsePluginList prefs = reader.parse(NutchConfiguration.create());

    // parse() returns null when the file could not be parsed
    if (prefs == null) {
      System.err.println("Unable to load the parse plugins configuration");
      System.exit(1);
    }

    for (String mimeType : prefs.getSupportedMimeTypes()) {

      System.out.println("MIMETYPE: " + mimeType);
      List<String> plugList = prefs.getPluginList(mimeType);

      System.out.println("EXTENSION IDs:");

      for (String extensionId : plugList) {
        System.out.println(extensionId);
      }
    }

  }

  /**
   * @return Returns the fParsePluginsFile.
   */
  public String getFParsePluginsFile() {
    return fParsePluginsFile;
  }

  /**
   * @param parsePluginsFile
   *          The fParsePluginsFile to set.
   */
  public void setFParsePluginsFile(String parsePluginsFile) {
    fParsePluginsFile = parsePluginsFile;
  }

  /**
   * Collects the plugin-id to extension-id aliases declared below the first
   * <code>aliases</code> element of the document.
   *
   * @param parsePluginsRoot
   *          the root element of the parse-plugins document
   * @return a map from alias name to extension id; empty if no aliases are
   *         declared
   */
  private Map<String, String> getAliases(Element parsePluginsRoot) {

    Map<String, String> aliases = new HashMap<>();
    NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");

    if (aliasRoot == null || aliasRoot.getLength() == 0) {
      LOG.warn("No aliases defined in parse-plugins.xml!");
      return aliases;
    }

    if (aliasRoot.getLength() > 1) {
      // log a warning, but try and continue processing
      LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml");
    }

    Element aliasRootElem = (Element) aliasRoot.item(0);
    NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");

    if (aliasElements == null) {
      return aliases;
    }

    for (int i = 0; i < aliasElements.getLength(); i++) {
      Element aliasElem = (Element) aliasElements.item(i);
      String parsePluginId = aliasElem.getAttribute("name");
      String extensionId = aliasElem.getAttribute("extension-id");
      LOG.trace("Found alias: plugin-id: {}, extension-id: {}", parsePluginId,
          extensionId);
      // getAttribute() yields an empty string, not null, for a missing
      // attribute, so incomplete aliases must be detected by emptiness
      if (!parsePluginId.isEmpty() && !extensionId.isEmpty()) {
        aliases.put(parsePluginId, extensionId);
      }
    }
    return aliases;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy