org.archive.modules.canonicalize.FixupQueryString Maven / Gradle / Ivy

Go to download
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.canonicalize;

/**
 * Strip any trailing question mark.
 * @author stack
 * @version $Date$, $Revision$
 */
public class FixupQueryString
extends BaseRule {

    private static final long serialVersionUID = 3L;

    /*
    private static final String DESCRIPTION =
        "Fixup the question mark that leads off the query string. " +
        "This rule returns 'http://www.archive.org/index.html' if passed" +
        " 'http://www.archive.org/index.html?'.  It will also strip '?&'" +
        " if '?&' is all that comprises the query string.  Also strips" +
        " extraneous leading '&': Returns 'http://archive.org/index.html?x=y" +
        " if passed 'http://archive.org/index.html?&x=y." +
        " Will also strip '&' if last thing in query string." +
        " Operates on all schemes.  This is a good rule to run toward the" +
        " end of canonicalization processing.";
        */
    
    public FixupQueryString() {
    }

    public String canonicalize(String url) {
        if (url == null || url.length() <= 0) {
            return url;
        }
        
        int index = url.lastIndexOf('?');
        if (index > 0) {
            if (index == (url.length() - 1)) {
                // '?' is last char in url.  Strip it.
                url = url.substring(0, url.length() - 1);
            } else if (url.charAt(index + 1) == '&') {
                // Next char is '&'. Strip it.
                if (url.length() == (index + 2)) {
                    // Then url ends with '?&'.  Strip them.
                    url = url.substring(0, url.length() - 2);
                } else {
                    // The '&' is redundant.  Strip it.
                    url = url.substring(0, index + 1) +
                    url.substring(index + 2);
                }
            } else if (url.charAt(url.length() - 1) == '&') {
                // If we have a lone '&' on end of query str,
                // strip it.
                url = url.substring(0, url.length() - 1);
            }
        }
        return url;
    }
}