org.apache.solr.update.processor.URLClassifyProcessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Show all versions of solr-core Show documentation
Apache Solr (module: core)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Locale;
import java.util.Objects;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Update processor which examines a URL and outputs to various other fields characteristics of that
* URL, including length, number of path levels, whether it is a top level URL (levels==0), whether
* it looks like a landing/index page, a canonical representation of the URL (e.g. stripping
* index.html), the domain and path parts of the URL etc.
*
* This processor is intended used in connection with processing web resources, and helping to
* produce values which may be used for boosting or filtering later.
*
*
In the example configuration below, we construct a custom updateRequestProcessorChain
*
and then instruct the /update
requesthandler to use it for every incoming
* document.
*
*
* <updateRequestProcessorChain name="urlProcessor">
* <processor class="org.apache.solr.update.processor.URLClassifyProcessorFactory">
* <bool name="enabled">true</bool>
* <str name="inputField">id</str>
* <str name="domainOutputField">hostname</str>
* </processor>
* <processor class="solr.RunUpdateProcessorFactory" />
* </updateRequestProcessorChain>
*
* <requestHandler name="/update" class="solr.UpdateRequestHandler">
* <lst name="defaults">
* <str name="update.chain">urlProcessor</str>
* </lst>
* </requestHandler>
*
*
* Then, at index time, Solr will look at the id
field value and extract it's domain
* portion into a new hostname
field. By default, the following fields will also be
* added:
*
*
* - url_length
*
- url_levels
*
- url_toplevel
*
- url_landingpage
*
*
* For example, adding the following document
*
*
* { "id":"http://wwww.mydomain.com/subpath/document.html" }
*
*
* will result in this document in Solr:
*
*
* {
* "id":"http://wwww.mydomain.com/subpath/document.html",
* "url_length":46,
* "url_levels":2,
* "url_toplevel":0,
* "url_landingpage":0,
* "hostname":"wwww.mydomain.com",
* "_version_":1603193062117343232}]
* }
*
*/
public class URLClassifyProcessor extends UpdateRequestProcessor {
private static final String INPUT_FIELD_PARAM = "inputField";
private static final String OUTPUT_LENGTH_FIELD_PARAM = "lengthOutputField";
private static final String OUTPUT_LEVELS_FIELD_PARAM = "levelsOutputField";
private static final String OUTPUT_TOPLEVEL_FIELD_PARAM = "toplevelOutputField";
private static final String OUTPUT_LANDINGPAGE_FIELD_PARAM = "landingpageOutputField";
private static final String OUTPUT_DOMAIN_FIELD_PARAM = "domainOutputField";
private static final String OUTPUT_CANONICALURL_FIELD_PARAM = "canonicalUrlOutputField";
private static final String DEFAULT_URL_FIELDNAME = "url";
private static final String DEFAULT_LENGTH_FIELDNAME = "url_length";
private static final String DEFAULT_LEVELS_FIELDNAME = "url_levels";
private static final String DEFAULT_TOPLEVEL_FIELDNAME = "url_toplevel";
private static final String DEFAULT_LANDINGPAGE_FIELDNAME = "url_landingpage";
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private boolean enabled = true;
private String urlFieldname = DEFAULT_URL_FIELDNAME;
private String lengthFieldname = DEFAULT_LENGTH_FIELDNAME;
private String levelsFieldname = DEFAULT_LEVELS_FIELDNAME;
private String toplevelpageFieldname = DEFAULT_TOPLEVEL_FIELDNAME;
private String landingpageFieldname = DEFAULT_LANDINGPAGE_FIELDNAME;
private String domainFieldname = null;
private String canonicalUrlFieldname = null;
private static final String[] landingPageSuffixes = {
"/",
"index.html",
"index.htm",
"index.phtml",
"index.shtml",
"index.xml",
"index.php",
"index.asp",
"index.aspx",
"welcome.html",
"welcome.htm",
"welcome.phtml",
"welcome.shtml",
"welcome.xml",
"welcome.php",
"welcome.asp",
"welcome.aspx"
};
public URLClassifyProcessor(
SolrParams parameters,
SolrQueryRequest request,
SolrQueryResponse response,
UpdateRequestProcessor nextProcessor) {
super(nextProcessor);
this.initParameters(parameters);
}
private void initParameters(SolrParams parameters) {
if (parameters != null) {
this.setEnabled(parameters.getBool("enabled", true));
this.urlFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_URL_FIELDNAME);
this.lengthFieldname = parameters.get(OUTPUT_LENGTH_FIELD_PARAM, DEFAULT_LENGTH_FIELDNAME);
this.levelsFieldname = parameters.get(OUTPUT_LEVELS_FIELD_PARAM, DEFAULT_LEVELS_FIELDNAME);
this.toplevelpageFieldname =
parameters.get(OUTPUT_TOPLEVEL_FIELD_PARAM, DEFAULT_TOPLEVEL_FIELDNAME);
this.landingpageFieldname =
parameters.get(OUTPUT_LANDINGPAGE_FIELD_PARAM, DEFAULT_LANDINGPAGE_FIELDNAME);
this.domainFieldname = parameters.get(OUTPUT_DOMAIN_FIELD_PARAM);
this.canonicalUrlFieldname = parameters.get(OUTPUT_CANONICALURL_FIELD_PARAM);
}
}
@Override
public void processAdd(AddUpdateCommand command) throws IOException {
if (isEnabled()) {
SolrInputDocument document = command.getSolrInputDocument();
if (document.containsKey(urlFieldname)) {
String url = (String) document.getFieldValue(urlFieldname);
try {
URL normalizedURL = getNormalizedURL(url);
document.setField(lengthFieldname, length(normalizedURL));
document.setField(levelsFieldname, levels(normalizedURL));
document.setField(toplevelpageFieldname, isTopLevelPage(normalizedURL) ? 1 : 0);
document.setField(landingpageFieldname, isLandingPage(normalizedURL) ? 1 : 0);
if (domainFieldname != null) {
document.setField(domainFieldname, normalizedURL.getHost());
}
if (canonicalUrlFieldname != null) {
document.setField(canonicalUrlFieldname, getCanonicalUrl(normalizedURL));
}
log.debug("{}", document);
} catch (MalformedURLException | URISyntaxException e) {
log.warn("cannot get the normalized url for '{}' due to ", url, e);
}
}
}
super.processAdd(command);
}
/**
* Gets a canonical form of the URL for use as main URL
*
* @param url The input url
* @return The URL object representing the canonical URL
*/
public URL getCanonicalUrl(URL url) throws MalformedURLException {
// NOTE: Do we want to make sure this URL is normalized? (Christian thinks we should)
String urlString = url.toString();
String lps = landingPageSuffix(url);
return new URL(urlString.replaceFirst("/" + lps + "$", "/"));
}
/**
* Calculates the length of the URL in characters
*
* @param url The input URL
* @return the length of the URL
*/
public int length(URL url) {
return url.toString().length();
}
/**
* Calculates the number of path levels in the given URL
*
* @param url The input URL
* @return the number of levels, where a top-level URL is 0
*/
public int levels(URL url) {
// Remove any trailing slashes for the purpose of level counting
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
int levels = 0;
for (int i = 0; i < path.length(); i++) {
if (path.charAt(i) == '/') {
levels++;
}
}
return levels;
}
/**
* Calculates whether a URL is a top level page
*
* @param url The input URL
* @return true if page is a top level page
*/
public boolean isTopLevelPage(URL url) {
// Remove any trailing slashes for the purpose of level counting
String path = getPathWithoutSuffix(url).replaceAll("/+$", "");
return path.length() == 0 && url.getQuery() == null;
}
/**
* Calculates whether the URL is a landing page or not
*
* @param url The input URL
* @return true if URL represents a landing page (index page)
*/
public boolean isLandingPage(URL url) {
if (url.getQuery() != null) {
return false;
} else {
return !Objects.equals(landingPageSuffix(url), "");
}
}
public URL getNormalizedURL(String url) throws MalformedURLException, URISyntaxException {
return new URI(url).normalize().toURL();
}
public boolean isEnabled() {
return enabled;
}
public void setEnabled(boolean enabled) {
this.enabled = enabled;
}
private String landingPageSuffix(URL url) {
String path = url.getPath().toLowerCase(Locale.ROOT);
for (String suffix : landingPageSuffixes) {
if (path.endsWith(suffix)) {
return suffix;
}
}
return "";
}
private String getPathWithoutSuffix(URL url) {
return url.getPath().toLowerCase(Locale.ROOT).replaceFirst(landingPageSuffix(url) + "$", "");
}
}