java.com.generationjava.scrape.HtmlScraper Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of gj-scrape Show documentation

Simple scraping component of GenJava-Core.

There is a newer version: 2.0.1

/*
 * Copyright (c) 2003, Henri Yandell
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or 
 * without modification, are permitted provided that the 
 * following conditions are met:
 * 
 * + Redistributions of source code must retain the above copyright notice, 
 *   this list of conditions and the following disclaimer.
 * 
 * + Redistributions in binary form must reproduce the above copyright notice, 
 *   this list of conditions and the following disclaimer in the documentation 
 *   and/or other materials provided with the distribution.
 * 
 * + Neither the name of Genjava-Core nor the names of its contributors 
 *   may be used to endorse or promote products derived from this software 
 *   without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE.
 */
package com.generationjava.scrape;

import java.io.InputStream;
import java.io.IOException;
import java.util.LinkedList;

import org.apache.commons.lang.StringUtils;
import com.generationjava.web.HtmlW;
import com.generationjava.web.XmlW;

/// simple system in. need to now consider the move methods.
/// then need to make sure the get method obeys internal 
/// rules. i.e. if given td.a and you find td.td, should it then look 
/// at that td? I guess so. Just as long as a tag doesn't span 
/// another that it's not allowed to.

// need to be able to say: move to b.class=blah and b=foo

/// Add a moveback(String tag) method
/// Add a getContent(int i) method to get untagged blocks of text
    /**
        HtmlScraper scraper = new HtmlScraper();
        scraper.scrape(UrlW.openUrlStream("http://www.yandell.org"));
        System.err.println(scraper.get("TITLE").trim());
        System.err.println(scraper.get("HEAD.TITLE").trim());
        System.err.println(scraper.get("HTML.HEAD.TITLE").trim());
        System.err.println(scraper.get("HTML.HEAD.META[name]"));
        while(scraper.move("LI")) {
            System.err.println(scraper.get("A[HREF]"));
        }
        System.err.println(scraper.get("LI.I"));
    */
public class HtmlScraper {

    // The entire page, exactly as it was handed to scrape(String)
    private String page;
    // Lowercase version of the entire page, built by scrape(String);
    // this is the copy that move(String) searches
    private String lcPage;
    // Position at which the string-scraper has reached;
    // reset() puts it back to 0
    private int currentIndex;

    /**
     * Create a scraper with no page loaded. Call scrape(String) 
     * to hand it the text to work on.
     */
    public HtmlScraper() {
        super();
    }

    /**
     * Report the position the scraper has currently reached.
     *
     * @return the current index into the page
     */
    public int getIndex() {
        final int position = this.currentIndex;
        return position;
    }

    /**
     * Load the text that is to be scraped and move back to its start.
     * Both the text itself and a lowercase copy of it are kept.
     *
     * @param text the page content to scrape
     * @throws RuntimeException if the text starts with "http://"
     */
    public void scrape(String text) {
        // NOTE(review): presumably this catches a url being passed in 
        // where the content of the page was meant - confirm
        boolean looksLikeUrl = text.startsWith("http://");
        if(looksLikeUrl) {
            throw new RuntimeException("Text starts with http://. This could be bad. ");
        }

        // NOTE(review): toLowerCase() uses the default locale here, as does 
        // the tag lowercasing in move(String); the two need to stay in step
        String lowered = text.toLowerCase();

        this.page = text;
        this.lcPage = lowered;
        reset();
    }

    /**
     * Build a new scraper whose page is whatever get(tag) returns 
     * on this scraper.
     *
     * @param tag the value passed on to get
     * @return a separate scraper loaded with the result of get(tag)
     */
    public HtmlScraper scrapeTag(String tag) {
        String content = get(tag);
        HtmlScraper nested = new HtmlScraper();
        nested.scrape(content);
        return nested;
    }

    /**
     * Rewind the scraper so that its position is the very 
     * beginning of the page again.
     */
    public void reset() {
        currentIndex = 0;
    }

    /**
     * Move forward to the next opening of the specified tag.
     * The tag is lowercased and looked up in the lowercase copy 
     * of the page, which is what makes the lookup case-insensitive.
     *
     * @param tag name of the tag to move to
     * @return true if the tag was found and the position moved, 
     *         false if it was not found; the position is then untouched
     */
    public boolean move(String tag) {
        String lcTag = tag.toLowerCase();
        // the search begins one character beyond the current position
        int tagStart = XmlW.getIndexOpeningTag(lcPage, lcTag, this.currentIndex + 1);
        if(tagStart != -1) {
            // the new position is one beyond the index that was reported
            this.currentIndex = tagStart + 1;
            return true;
        }
        return false;
    }

    /**
     * Helper method.
     * Calls move(tag) up to sz times in a row, giving up at the 
     * first call that fails. Moves already made before a failure 
     * are not undone.
     *
     * @param tag name of the tag to move to
     * @param sz  how many tags to move; zero or less moves nothing
     * @return true if every one of the moves succeeded
     */
    public boolean move(String tag, int sz) {
        int remaining = sz;
        while(remaining > 0) {
            boolean moved = move(tag);
            if(!moved) {
                return false;
            }
            remaining--;
        }
        return true;
    }

    // finds any tag with name=value attribute. Need to be able 
    // to specify the tag really, and also the same for a value.
    // really we need a generic search method? :)
    // get this value, is it this. if not, find next.

    // Tricky. name needs to be case-insensitive, value needs 
    // to not be.
    /**
     * Move to the start of the next tag that carries the given 
     * attribute. The new position is the closest opening angle 
     * bracket before the attribute.
     *
     * An occurrence whose angle bracket is exactly the current 
     * position is taken to be the tag a previous call stopped on. 
     * It is skipped and the search carries on behind it. The 
     * comparison is always made against the position held when 
     * the method was called, so a matching tag directly after the 
     * current one is found rather than skipped as well.
     *
     * An occurrence with no opening angle bracket anywhere before 
     * it does not belong to a tag and is skipped too.
     *
     * Both name and value are matched case-sensitively, and the 
     * unquoted spelling matches any value that merely starts with 
     * the given one.
     *
     * @param name  name of the attribute
     * @param value value the attribute must have
     * @return true if the position was moved to such a tag; false 
     *         if there is none, the position is then unchanged
     */
    public boolean moveToTagWith(String name, String value) {
        int from = this.currentIndex;
        while(true) {
            int attrIdx = findAttributeFrom(name, value, from);
            if(attrIdx == -1) {
                return false;
            }
            int tagIdx = this.page.lastIndexOf("<", attrIdx);
            // -1 means the text is not inside any tag; currentIndex means 
            // we are already sitting on this tag from an earlier call
            if( (tagIdx != -1) && (tagIdx != this.currentIndex) ) {
                this.currentIndex = tagIdx;
                return true;
            }
            // carry on behind the occurrence that was just rejected
            from = attrIdx + 1;
        }
    }

    // Index of the attribute at or after from, or -1. The spellings 
    // name="value", name=value and name='value' are tried in that order 
    // and the first spelling found anywhere ahead is the one used.
    private int findAttributeFrom(String name, String value, int from) {
        int idx = this.page.indexOf(name+"=\""+value+"\"", from);
        if(idx == -1) {
            idx = this.page.indexOf(name+"="+value, from);
        }
        if(idx == -1) {
            idx = this.page.indexOf(name+"='"+value+"'", from);
        }
        return idx;
    }

    //   moveTo a[href], www.yandell.org
    /**
     * Move to the first place ahead at which get(get) gives the 
     * specified value.
     *
     * A second scraper is run over the rest of the page to look 
     * ahead. Every move it makes is counted, and once it hits the 
     * value this scraper makes the same number of moves.
     *
     * @param get   what to move to and then ask get for
     * @param value the value that is wanted
     * @return true if the value was found and this scraper moved; 
     *         false if the look-ahead could move no further, or it 
     *         got null or an empty string before finding the value
     */
    public boolean moveTo(String get, String value) {
        HtmlScraper lookahead = new HtmlScraper();
        lookahead.scrape(this.page.substring(this.currentIndex));

        for(int hops = 1; lookahead.move(get); hops++) {
            String candidate = lookahead.get(get);
            if( (candidate == null) || (candidate.length() == 0) ) {
                // nothing to compare against, so the search is given up
                return false;
            }
            if(candidate.equals(value)) {
                // repeat the look-ahead's moves on this scraper
                move(get, hops);
                return true;
            }
        }
        return false;
    }

    /**
     * Move to the next occurrence of a piece of text, looking 
     * from the current position onwards. The match is exact, 
     * so it is case-sensitive.
     *
     * @param text the text to look for
     * @return true if the text was found and the position now 
     *         points at its first character; false if it was not 
     *         found, the position is then unchanged
     */
    public boolean moveToText(String text) {
        int hit = this.page.indexOf(text, this.currentIndex);
        boolean found = (hit != -1);
        if(found) {
            this.currentIndex = hit;
        }
        return found;
    }

    /**
     * Move to a specified comment. The parameter should 
     * not contain the HTML comment syntax.
     */
    public boolean moveToComment(String comment) {
        int idx = this.page.indexOf(comment, this.currentIndex);
        if(idx == -1) {
            return false;
        } else {
            idx = page.lastIndexOf("

    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api