All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.modules.forms.HTMLForm Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.forms;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang.StringUtils;

/**
 * Simple representation of a discovered HTML Form: its submission
 * attributes plus the INPUT elements found inside it, kept in the order
 * they were added.
 *
 * <p>Inputs of type text/email are additionally tracked as candidate
 * username fields, and inputs of type password as candidate password
 * fields, so that {@link #seemsLoginForm()} and
 * {@link #formData(String, String)} can recognize and fill in a likely
 * login form.
 *
 * @author gojomo
 */
public class HTMLForm {
    /** A single INPUT element recorded for the enclosing form. */
    public class FormInput {
        public String type;
        public String name;
        public String value;
        public boolean checked = false;

        /** Renders the input as an XPath-like expression, for diagnostics. */
        @Override
        public String toString() {
            String str = "input[@type='" + type + "'][@name='" + name + "'][@value='" + value + "']";
            if (checked) {
                str = str + "[@checked]";
            }
            return str;
        }
    }

    protected String method;
    protected String action;
    protected String enctype;

    /** Every recorded input, in the order it was added. */
    protected List<FormInput> allInputs = new ArrayList<FormInput>();
    /** Subset of {@link #allInputs} whose type is text or email. */
    protected List<FormInput> candidateUsernameInputs = new ArrayList<FormInput>();
    /** Subset of {@link #allInputs} whose type is password. */
    protected List<FormInput> candidatePasswordInputs = new ArrayList<FormInput>();

    /**
     * Add a discovered INPUT, tracking it as potential
     * username/password receiver.
     *
     * <p>A second (or later) submit input is ignored, so that at most one
     * submit input is ever recorded for the form.
     *
     * @param type input type; {@code null} is treated as "text"
     * @param name input name, may be {@code null}
     * @param value input value, may be {@code null}
     * @param checked true if "checked" attribute is present (for radio buttons and checkboxes)
     */
    public void addField(String type, String name, String value, boolean checked) {
        if (isMultipleFormSubmitInputs(type)) {
            return;
        }

        FormInput input = new FormInput();
        // default input type is text per html standard
        input.type = (type == null) ? "text" : type;
        input.name = name;
        input.value = value;
        input.checked = checked;
        allInputs.add(input);

        if ("text".equalsIgnoreCase(input.type) || "email".equalsIgnoreCase(input.type)) {
            candidateUsernameInputs.add(input);
        } else if ("password".equalsIgnoreCase(input.type)) {
            candidatePasswordInputs.add(input);
        }
    }

    /**
     * Tells whether adding an input of the given type would give the form
     * more than one submit input.
     *
     * @param type type of the input about to be added; {@code null} means
     *        an untyped (text) input, which is never a submit
     * @return true only if {@code type} is "submit" (any case) and a submit
     *         input has already been recorded
     */
    public boolean isMultipleFormSubmitInputs(String type) {
        // equalsIgnoreCase is null-safe and, unlike toLowerCase(), does not
        // depend on the default locale (e.g. Turkish dotless i in "SUBMIT").
        if (!"submit".equalsIgnoreCase(type)) {
            return false;
        }

        for (FormInput input : allInputs) {
            if ("submit".equalsIgnoreCase(input.type)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Add a discovered INPUT that is not checked, tracking it as potential
     * username/password receiver.
     *
     * @param type input type; {@code null} is treated as "text"
     * @param name input name, may be {@code null}
     * @param value input value, may be {@code null}
     */
    public void addField(String type, String name, String value) {
        addField(type, name, value, false);
    }

    public void setMethod(String method) {
        this.method = method;
    }

    public String getAction() {
        return action;
    }

    public void setAction(String action) {
        this.action = action;
    }

    public String getEnctype() {
        return enctype;
    }

    public void setEnctype(String enctype) {
        this.enctype = enctype;
    }

    /**
     * For now, we consider a POST form with only 1 password
     * field and 1 potential username field (type text or email)
     * to be a likely login form.
     *
     * @return boolean likely login form
     */
    public boolean seemsLoginForm() {
        return "post".equalsIgnoreCase(method)
                && candidatePasswordInputs.size() == 1
                && presumedUsernameInput() != null;
    }

    /**
     * Picks the input most likely to receive the username.
     *
     * @return the sole text/email input if there is exactly one; otherwise
     *         the single candidate whose name contains "username"
     *         (case-insensitively); {@code null} if there are no candidates
     *         or the choice is ambiguous
     */
    protected FormInput presumedUsernameInput() {
        if (candidateUsernameInputs.isEmpty()) {
            return null;
        }
        if (candidateUsernameInputs.size() == 1) {
            return candidateUsernameInputs.get(0);
        }
        // more than one candidate; if there is exactly one whose name
        // contains the string "username", choose that one
        FormInput choice = null;
        for (FormInput input : candidateUsernameInputs) {
            if (input.name != null && input.name.toLowerCase().indexOf("username") != -1) {
                if (choice != null) {
                    // second match: ambiguous, give up
                    return null;
                }
                choice = input;
            }
        }
        return choice;
    }

    /** A name/value pair to be submitted with the form. */
    public static class NameValue {
        public String name, value;
        public NameValue(String name, String value) {
            this.name = name;
            this.value = value;
        }
    }

    /**
     * Builds the name/value pairs to submit for this form, in input order.
     *
     * <p>The presumed username input receives {@code username} and the
     * first password input receives {@code password}. Every other input
     * contributes its own name and value (null replaced by ""), except
     * radio buttons and checkboxes, which are included only when checked.
     *
     * @param username value to submit for the presumed username input
     * @param password value to submit for the first password input
     * @return the pairs to submit; never {@code null}
     */
    public LinkedList<NameValue> formData(String username, String password) {
        // Resolve both targets once; neither changes while we iterate.
        FormInput usernameInput = presumedUsernameInput();
        // A form without any password input simply has no password target.
        FormInput passwordInput = candidatePasswordInputs.isEmpty()
                ? null : candidatePasswordInputs.get(0);

        LinkedList<NameValue> nameVals = new LinkedList<NameValue>();
        for (FormInput input : allInputs) {
            if (input == usernameInput) {
                nameVals.add(new NameValue(input.name, username));
            } else if (input == passwordInput) {
                nameVals.add(new NameValue(input.name, password));
            } else {
                boolean toggle = "radio".equalsIgnoreCase(input.type)
                        || "checkbox".equalsIgnoreCase(input.type);
                if (!toggle || input.checked) {
                    nameVals.add(new NameValue(
                            input.name == null ? "" : input.name,
                            input.value == null ? "" : input.value));
                }
            }
        }
        return nameVals;
    }

    /**
     * Multi-line dump: "method action" followed by one
     * "type name value" line per input.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(method);
        sb.append(" ");
        sb.append(action);
        for (FormInput input : allInputs) {
            sb.append("\n  ");
            sb.append(input.type);
            sb.append(" ");
            sb.append(input.name);
            sb.append(" ");
            sb.append(input.value);
        }
        return sb.toString();
    }

    /**
     * Provide abbreviated annotation, of the form...
     *  "form:Phhpt"
     *
     * ...where the first capital letter indicates submission
     * type, G[ET] or P[OST], and following lowercase letters
     * types of inputs in order, by their first letter.
     *
     * <p>A form whose method was never set (or is empty) is annotated as
     * 'G', GET being the HTML default; an input with an empty type is
     * annotated as 't', text being the HTML default.
     *
     * @return String suitable for brief crawl.log annotation
     */
    public String asAnnotation() {
        StringBuilder sb = new StringBuilder();
        sb.append("form:");
        sb.append(Character.toUpperCase(firstCharOr(method, 'G')));
        for (FormInput input : allInputs) {
            sb.append(Character.toLowerCase(firstCharOr(input.type, 't')));
        }
        return sb.toString();
    }

    /** Returns the first character of {@code s}, or {@code fallback} if {@code s} is null or empty. */
    private static char firstCharOr(String s, char fallback) {
        return (s == null || s.isEmpty()) ? fallback : s.charAt(0);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy