// org.apache.solr.update.processor.URLClassifyProcessor
// Artifact: solr-core -- Apache Solr (module: core). Note: a newer version is available (9.7.0).
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.update.processor;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Locale;
import java.util.Objects;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Update processor which examines a URL and outputs to various other fields characteristics of that
 * URL, including length, number of path levels, whether it is a top level URL (levels==0), whether
 * it looks like a landing/index page, a canonical representation of the URL (e.g. stripping
 * index.html), the domain and path parts of the URL etc.
 *
 * <p>This processor is intended to be used in connection with processing web resources, helping to
 * produce values which may be used for boosting or filtering later.
 *
 * <p>In the example configuration below, we construct a custom <code>updateRequestProcessorChain
 * </code> and then instruct the <code>/update</code> request handler to use it for every incoming
 * document.
 *
 * <pre class="prettyprint">
 * &lt;updateRequestProcessorChain name="urlProcessor"&gt;
 *   &lt;processor class="org.apache.solr.update.processor.URLClassifyProcessorFactory"&gt;
 *     &lt;bool name="enabled"&gt;true&lt;/bool&gt;
 *     &lt;str name="inputField"&gt;id&lt;/str&gt;
 *     &lt;str name="domainOutputField"&gt;hostname&lt;/str&gt;
 *   &lt;/processor&gt;
 *   &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
 * &lt;/updateRequestProcessorChain&gt;
 *
 * &lt;requestHandler name="/update" class="solr.UpdateRequestHandler"&gt;
 *   &lt;lst name="defaults"&gt;
 *     &lt;str name="update.chain"&gt;urlProcessor&lt;/str&gt;
 *   &lt;/lst&gt;
 * &lt;/requestHandler&gt;
 * </pre>
 *
 * <p>Then, at index time, Solr will look at the <code>id</code> field value and extract its domain
 * portion into a new <code>hostname</code> field. By default, the following fields will also be
 * added:
 *
 * <ul>
 *   <li>url_length
 *   <li>url_levels
 *   <li>url_toplevel
 *   <li>url_landingpage
 * </ul>
 *
 * <p>For example, adding the following document
 *
 * <pre class="prettyprint">
 * { "id":"http://wwww.mydomain.com/subpath/document.html" }
 * </pre>
 *
 * <p>will result in this document in Solr:
 *
 * <pre class="prettyprint">
 * {
 *  "id":"http://wwww.mydomain.com/subpath/document.html",
 *  "url_length":46,
 *  "url_levels":2,
 *  "url_toplevel":0,
 *  "url_landingpage":0,
 *  "hostname":"wwww.mydomain.com",
 *  "_version_":1603193062117343232
 * }
 * </pre>
 */
public class URLClassifyProcessor extends UpdateRequestProcessor {

  // Names of the configuration parameters read in initParameters.
  private static final String INPUT_FIELD_PARAM = "inputField";
  private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
  private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
  private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField";
  private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField";
  private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField";
  private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField";

  // Field names used when the corresponding parameter is not configured.
  private static final String DEFAULT_URL_FIELDNAME = "url";
  private static final String DEFAULT_LENGTH_FIELDNAME = "url_length";
  private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels";
  private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel";
  private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage";

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Lower-case path endings that mark a URL as a landing page: a trailing slash, or one of the
   * listed index/welcome file names as the last path segment.
   */
  private static final String[] landingPageSuffixes = {
    "/",
    "index.html",
    "index.htm",
    "index.phtml",
    "index.shtml",
    "index.xml",
    "index.php",
    "index.asp",
    "index.aspx",
    "welcome.html",
    "welcome.htm",
    "welcome.phtml",
    "welcome.shtml",
    "welcome.xml",
    "welcome.php",
    "welcome.asp",
    "welcome.aspx"
  };

  private boolean enabled = true;
  private String urlFieldname = DEFAULT_URL_FIELDNAME;
  private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME;
  private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME;
  private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME;
  private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME;
  // The two outputs below are optional: they are only written when a field name is configured.
  private String domainFieldname = null;
  private String canonicalUrlFieldname = null;

  /**
   * Creates the processor and reads its configuration.
   *
   * @param parameters configuration parameters; when {@code null} all defaults are kept
   * @param request the current request (not used by this processor)
   * @param response the current response (not used by this processor)
   * @param nextProcessor the next processor in the chain, handed to the superclass
   */
  public URLClassifyProcessor(
      SolrParams parameters,
      SolrQueryRequest request,
      SolrQueryResponse response,
      UpdateRequestProcessor nextProcessor) {
    super(nextProcessor);

    this.initParameters(parameters);
  }

  /** Copies the configured field names and the enabled flag, falling back to the defaults. */
  private void initParameters(SolrParams parameters) {
    if (parameters != null) {
      this.setEnabled(parameters.getBool("enabled", true));
      this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME);
      this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME);
      this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME);
      this.toplevelpageFieldname =
          parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME);
      this.landingpageFieldname =
          parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME);
      this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM);
      this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
    }
  }

  /**
   * Classifies the URL found in the configured input field (if the processor is enabled and the
   * field is present) and then always delegates to the superclass implementation.
   *
   * <p>A URL that cannot be parsed or normalized is logged as a warning and skipped; it never
   * prevents the command from being passed on.
   *
   * @param command the add command whose document is examined and enriched
   * @throws IOException if thrown by the superclass implementation
   */
  @Override
  public void processAdd(AddUpdateCommand command) throws IOException {
    if (isEnabled()) {
      SolrInputDocument document = command.getSolrInputDocument();
      if (document.containsKey(urlFieldname)) {
        // The field may hold null or a non-String value; convert instead of casting so that
        // neither case can abort the update with a NullPointer/ClassCastException.
        String url = Objects.toString(document.getFieldValue(urlFieldname), null);
        if (url != null) {
          classify(document, url);
        }
      }
    }
    super.processAdd(command);
  }

  /** Writes all configured output fields for {@code url} into {@code document}. */
  private void classify(SolrInputDocument document, String url) {
    try {
      URL normalizedURL = getNormalizedURL(url);
      document.setField(lengthFieldname, length(normalizedURL));
      document.setField(levelsFieldname, levels(normalizedURL));
      document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0);
      document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0);
      if (domainFieldname != null) {
        document.setField(domainFieldname, normalizedURL.getHost());
      }
      if (canonicalUrlFieldname != null) {
        document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL));
      }
      log.debug("{}", document);
    } catch (MalformedURLException | URISyntaxException | IllegalArgumentException e) {
      // IllegalArgumentException is what URI.toURL() throws for a relative (non-absolute) URI,
      // e.g. "foo/bar"; treat it like any other unusable URL rather than failing the update.
      log.warn("cannot get the normalized url for '{}' due to ", url, e);
    }
  }

  /**
   * Gets a canonical form of the URL for use as main URL: a trailing landing-page file name such
   * as {@code index.html} is removed so that the URL ends with the directory slash.
   *
   * <p>The file name is matched case-insensitively and only when it is the very end of the URL's
   * string form, so URLs carrying a query string or fragment are returned unchanged.
   *
   * @param url The input url
   * @return The URL object representing the canonical URL
   * @throws MalformedURLException if the resulting string cannot be turned into a URL
   */
  public URL getCanonicalUrl(URL url) throws MalformedURLException {
    // NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should)
    String urlString = url.toString();
    String tail = "/" + landingPageSuffix(url);
    int tailStart = urlString.length() - tail.length();
    // Plain (non-regex) comparison: the '.' in a suffix must not act as a wildcard, and the
    // detection in landingPageSuffix is case-insensitive, so this comparison has to be as well.
    if (tailStart >= 0 && urlString.regionMatches(true, tailStart, tail, 0, tail.length())) {
      // Keep the leading '/' of the tail, drop the rest.
      urlString = urlString.substring(0, tailStart + 1);
    }
    return new URL(urlString);
  }

  /**
   * Calculates the length of the URL in characters
   *
   * @param url The input URL
   * @return the length of the URL
   */
  public int length(URL url) {
    return url.toString().length();
  }

  /**
   * Calculates the number of path levels in the given URL
   *
   * @param url The input URL
   * @return the number of levels, where a top-level URL is 0
   */
  public int levels(URL url) {
    String path = getPathForLevelCounting(url);
    int levels = 0;
    for (int i = 0; i < path.length(); i++) {
      if (path.charAt(i) == '/') {
        levels++;
      }
    }
    return levels;
  }

  /**
   * Calculates whether a URL is a top level page
   *
   * @param url The input URL
   * @return true if page is a top level page, i.e. it has no path besides an optional landing-page
   *     suffix and no query string
   */
  public boolean isTopLevelPage(URL url) {
    return getPathForLevelCounting(url).isEmpty() && url.getQuery() == null;
  }

  /**
   * Calculates whether the URL is a landing page or not
   *
   * @param url The input URL
   * @return true if URL represents a landing page (index page); always false when the URL has a
   *     query string
   */
  public boolean isLandingPage(URL url) {
    return url.getQuery() == null && !landingPageSuffix(url).isEmpty();
  }

  /**
   * Parses the given string as a URI, normalizes it and converts it to a URL.
   *
   * @param url the URL string to normalize
   * @return the normalized URL
   * @throws MalformedURLException if the normalized URI cannot be converted to a URL
   * @throws URISyntaxException if the string is not a syntactically valid URI
   * @throws IllegalArgumentException if the URI is not absolute
   */
  public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException {
    return new URI(url).normalize().toURL();
  }

  /**
   * @return whether this processor classifies URLs; when false, documents pass through untouched
   */
  public boolean isEnabled() {
    return enabled;
  }

  /**
   * @param enabled whether this processor should classify URLs
   */
  public void setEnabled(boolean enabled) {
    this.enabled = enabled;
  }

  /**
   * Returns the landing-page suffix the (lower-cased) path of {@code url} ends with, or the empty
   * string if there is none.
   *
   * <p>A file-name suffix only counts when it is a complete last path segment, so {@code
   * /index.html} matches but {@code /myindex.html} does not.
   */
  private String landingPageSuffix(URL url) {
    String path = url.getPath().toLowerCase(Locale.ROOT);
    for (String suffix : landingPageSuffixes) {
      boolean matches =
          suffix.equals("/")
              ? path.endsWith(suffix)
              : path.equals(suffix) || path.endsWith("/" + suffix);
      if (matches) {
        return suffix;
      }
    }
    return "";
  }

  /** Returns the lower-cased path of {@code url} with any landing-page suffix cut off. */
  private String getPathWithoutSuffix(URL url) {
    String path = url.getPath().toLowerCase(Locale.ROOT);
    // landingPageSuffix guarantees that the lower-cased path ends with the returned suffix
    // (possibly the empty string), so cutting by length is exact and needs no regex.
    return path.substring(0, path.length() - landingPageSuffix(url).length());
  }

  /**
   * Returns the path used for level counting: the path without its landing-page suffix and
   * without any trailing slashes.
   */
  private String getPathForLevelCounting(URL url) {
    String path = getPathWithoutSuffix(url);
    int end = path.length();
    while (end > 0 && path.charAt(end - 1) == '/') {
      end--;
    }
    return path.substring(0, end);
  }
}