
// Source listing: org.archive.modules.canonicalize.FixupQueryString (Maven / Gradle / Ivy)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.canonicalize;
/**
 * Canonicalization rule that tidies up the edges of a URL's query string.
 *
 * <p>The query string is taken to begin after the <em>last</em> {@code '?'}
 * in the URL. The rule then:
 * <ul>
 * <li>strips a trailing {@code '?'}:
 *     {@code http://www.archive.org/index.html?} becomes
 *     {@code http://www.archive.org/index.html};</li>
 * <li>strips {@code '?&'} when that is all the query string consists of;</li>
 * <li>strips extraneous leading {@code '&'}:
 *     {@code http://archive.org/index.html?&x=y} becomes
 *     {@code http://archive.org/index.html?x=y};</li>
 * <li>strips extraneous trailing {@code '&'}:
 *     {@code http://archive.org/index.html?x=y&} becomes
 *     {@code http://archive.org/index.html?x=y}.</li>
 * </ul>
 *
 * <p>No scheme check is made, so the rule operates on all schemes. The
 * original author recommends running it toward the end of canonicalization
 * processing.
 *
 * @author stack
 * @version $Date$, $Revision$
 */
public class FixupQueryString
extends BaseRule {
    private static final long serialVersionUID = 3L;

    public FixupQueryString() {
    }

    /**
     * Removes empty leading/trailing {@code '&'} separators from the query
     * string and drops the {@code '?'} altogether when nothing is left
     * after it.
     *
     * <p>Leading and trailing separators are trimmed independently, so an
     * input such as {@code ...?&x=y&} is fully cleaned in a single call
     * (yielding {@code ...?x=y}).
     *
     * @param url the URL to fix up; may be {@code null} or empty
     * @return the fixed-up URL; the very same string is returned when the
     *         input is {@code null}, empty, has no {@code '?'} past the
     *         first character, or needs no change
     */
    public String canonicalize(String url) {
        if (url == null || url.isEmpty()) {
            return url;
        }

        // A '?' at position 0 (or no '?' at all) is deliberately left alone.
        final int questionMark = url.lastIndexOf('?');
        if (questionMark <= 0) {
            return url;
        }

        final int queryBegin = questionMark + 1;
        final int queryEnd = url.length();

        // Narrow [start, end) until it no longer begins or ends with '&'.
        int start = queryBegin;
        int end = queryEnd;
        while (start < end && url.charAt(start) == '&') {
            start++;
        }
        while (end > start && url.charAt(end - 1) == '&') {
            end--;
        }

        if (start == end) {
            // Nothing meaningful follows the '?': drop it too.
            return url.substring(0, questionMark);
        }
        if (start == queryBegin && end == queryEnd) {
            // Already clean; avoid building a new string.
            return url;
        }
        return url.substring(0, queryBegin) + url.substring(start, end);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy