// java.com.generationjava.scrape.HtmlScraper Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of gj-scrape Show documentation
// Show all versions of gj-scrape Show documentation
// Simple scraping component of GenJava-Core.
/*
* Copyright (c) 2003, Henri Yandell
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the
* following conditions are met:
*
* + Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* + Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* + Neither the name of Genjava-Core nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package com.generationjava.scrape;
import java.io.InputStream;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.commons.lang.StringUtils;
import com.generationjava.web.HtmlW;
import com.generationjava.web.XmlW;
/// Simple system is in place. Need to now consider the move methods.
/// Then need to make sure the get method obeys internal
/// rules, i.e. if given td.a and you find td.td, should it then look
/// at that td? I guess so, just as long as a tag doesn't span
/// another that it's not allowed to.
// Need to be able to say: move to b.class=blah and b=foo
/// Add a moveback(String tag) method
/// Add a getContent(int i) method to get untagged blocks of text
/**
 * Example usage:
 * <pre>
 * HtmlScraper scraper = new HtmlScraper();
 * scraper.scrape(UrlW.openUrlStream("http://www.yandell.org"));
 * System.err.println(scraper.get("TITLE").trim());
 * System.err.println(scraper.get("HEAD.TITLE").trim());
 * System.err.println(scraper.get("HTML.HEAD.TITLE").trim());
 * System.err.println(scraper.get("HTML.HEAD.META[name]"));
 * while(scraper.move("LI")) {
 *     System.err.println(scraper.get("A[HREF]"));
 * }
 * System.err.println(scraper.get("LI.I"));
 * </pre>
 */
public class HtmlScraper {
// The entire page, exactly as it was passed to scrape(String).
private String page;
// Lowercase version of the entire page; move(String) searches this copy
// with a lowercased tag name so that tag matching ignores case.
private String lcPage;
// Position which the scraper has reached in the page. reset() sets it
// to 0 and the move methods update it.
private int currentIndex;
/**
 * Creates a scraper with no page loaded. No page is available until
 * scrape(String) has been called.
 */
public HtmlScraper() {
    super();
}
/**
 * Returns the position the scraper has currently reached in the page.
 *
 * @return the current offset into the page
 */
public int getIndex() {
    return currentIndex;
}
/**
 * Loads the given text as the page to scrape and rewinds the scraper to
 * the start of it. A lowercase copy of the text is kept alongside the
 * original.
 *
 * @param text the page content itself; must not be null
 * @throws RuntimeException if the text starts with "http://"
 *         (NOTE(review): presumably guards against a URL being passed
 *         in place of the page content - confirm)
 */
public void scrape(String text) {
    boolean startsWithHttp = text.startsWith("http://");
    if (startsWithHttp) {
        throw new RuntimeException("Text starts with http://. This could be bad. ");
    }

    String lowered = text.toLowerCase();
    this.page = text;
    this.lcPage = lowered;
    reset();
}
/**
 * Creates a new, independent scraper whose page is the value that
 * get(tag) returns for this scraper.
 *
 * @param tag the tag expression handed to get(String)
 * @return a fresh scraper loaded with that value
 */
public HtmlScraper scrapeTag(String tag) {
    HtmlScraper child = new HtmlScraper();
    String content = get(tag);
    child.scrape(content);
    return child;
}
/**
 * Rewinds the scraper to the beginning of the page.
 */
public void reset() {
    currentIndex = 0;
}
/**
 * Moves to the next opening tag with the given name.
 *
 * The tag name is lowercased and looked up in the lowercase copy of the
 * page, so the match ignores case. The search starts one character past
 * the current position; on success the position becomes one past the
 * index reported by XmlW.getIndexOpeningTag.
 *
 * @param tag the tag name to move to
 * @return true if a tag was found and the position was updated,
 *         false if none was found (the position is left unchanged)
 */
public boolean move(String tag) {
    String wanted = tag.toLowerCase();
    int searchFrom = this.currentIndex + 1;
    int hit = XmlW.getIndexOpeningTag(lcPage, wanted, searchFrom);
    if (hit != -1) {
        this.currentIndex = hit + 1;
        return true;
    }
    return false;
}
/**
 * Helper method: moves forward over a number of tags by calling
 * move(String) repeatedly.
 *
 * If one of the moves fails, the moves already made are not undone.
 *
 * @param tag the tag name to move to
 * @param sz how many times to move; zero or less performs no move
 * @return true if every move succeeded, false as soon as one fails
 */
public boolean move(String tag, int sz) {
    int remaining = sz;
    while (remaining > 0) {
        boolean moved = move(tag);
        if (!moved) {
            return false;
        }
        remaining--;
    }
    return true;
}
/**
 * Moves to the start of the next tag that carries the given attribute
 * with the given value.
 *
 * The attribute is looked for in three spellings, tried in this order
 * from the search position: name="value", name=value and name='value'.
 * The first spelling that occurs anywhere ahead wins, even if another
 * spelling occurs earlier in the page. On success the position is set
 * to the '&lt;' that precedes the match.
 *
 * Known limitations:
 * - Any tag name matches; only the attribute is checked.
 * - Both name and value are matched case-sensitively (ideally the name
 *   would be case-insensitive).
 * - The unquoted spelling is a prefix match, so class=foo also matches
 *   class=foobar.
 * - A tag that starts exactly at the current position is treated as the
 *   one found by the previous call and is skipped. This includes a
 *   matching tag at offset 0 right after reset().
 *
 * @param name the attribute name
 * @param value the attribute value
 * @return true if a tag was found and the position was updated, false
 *         otherwise (the position is left unchanged)
 */
public boolean moveToTagWith(String name, String value) {
    final String doubleQuoted = name + "=\"" + value + "\"";
    final String unquoted = name + "=" + value;
    final String singleQuoted = name + "='" + value + "'";

    int from = this.currentIndex;
    while (true) {
        int hit = this.page.indexOf(doubleQuoted, from);
        if (hit == -1) {
            hit = this.page.indexOf(unquoted, from);
        }
        if (hit == -1) {
            hit = this.page.indexOf(singleQuoted, from);
        }
        if (hit == -1) {
            return false;
        }

        int tagStart = this.page.lastIndexOf("<", hit);
        // tagStart == -1: the text matched before any '<', so there is no
        // tag to move to; never store -1 as the position.
        // tagStart == currentIndex: this is the tag we are already sitting
        // on, so keep looking further along. Comparing against the original
        // position (not a bumped one) keeps an adjacent matching tag from
        // being mistaken for the previous result.
        if (tagStart != -1 && tagStart != this.currentIndex) {
            this.currentIndex = tagStart;
            return true;
        }
        from = hit + 1;
    }
}
/**
 * Moves to the first place ahead where get(get) yields the given value,
 * for example moveTo("a[href]", "www.yandell.org").
 *
 * A temporary scraper is run over the rest of the page (from the current
 * position) to count how many move(get) steps are needed; this scraper
 * then performs that many moves itself.
 *
 * @param get the expression passed to both move(String) and get(String)
 * @param value the value that get(get) must equal
 * @return true if the value was found and this scraper was moved;
 *         false if no further move is possible or if get(get) returns
 *         null or an empty string before a match is seen
 */
public boolean moveTo(String get, String value) {
    HtmlScraper lookahead = new HtmlScraper();
    lookahead.scrape(this.page.substring(this.currentIndex));

    for (int hops = 1; lookahead.move(get); hops++) {
        String chunk = lookahead.get(get);
        if (chunk == null || chunk.length() == 0) {
            return false;
        }
        if (chunk.equals(value)) {
            // Replay the same number of moves on this scraper.
            move(get, hops);
            return true;
        }
    }
    return false;
}
/**
 * Moves to the next occurrence of the given text, searching from the
 * current position (inclusive) in the original, case-preserved page.
 *
 * @param text the exact text to look for
 * @return true if the text was found and the position now points at its
 *         first character, false otherwise (the position is unchanged)
 */
public boolean moveToText(String text) {
    int hit = this.page.indexOf(text, this.currentIndex);
    boolean found = (hit != -1);
    if (found) {
        this.currentIndex = hit;
    }
    return found;
}
/**
* Move to a specified comment. The parameter should
* not contain the HTML comment syntax.
*/
public boolean moveToComment(String comment) {
int idx = this.page.indexOf(comment, this.currentIndex);
if(idx == -1) {
return false;
} else {
idx = page.lastIndexOf("