
prerna.engine.impl.web.WebScrapeEngine Maven / Gradle / Ivy
The newest version!
package prerna.engine.impl.web;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import prerna.engine.api.IDatabaseEngine;
import prerna.engine.impl.json.JsonAPIEngine;
import prerna.util.Constants;
import prerna.util.Utility;
public class WebScrapeEngine extends JsonAPIEngine {
private static final Logger logger = LogManager.getLogger(WebScrapeEngine.class);
// override execute query and get enginetype etc.
// gets the URL - Mostly with the parameters
// specifies if it is get or post
// gets the stream and loads it into jsoup
// gets the selector and runs it
// the selector is cardinality (which specific table to pick) from the page
// table_number = 3
// or
// table_id = abcd
// or
// table_class = number = pick it based on class
// or
// table_attribute = number
public static final String TABLE_NUMBER = "table_number";
public static final String TABLE_ID = "table_id";
public static final String TABLE_CLASS = "table_class";
@Override
public DATABASE_TYPE getDatabaseType() {
return IDatabaseEngine.DATABASE_TYPE.WEB;
}
@Override
public Object execQuery(String query)
{
Hashtable retHash = new Hashtable();
try {
String url = smssProp.getProperty(INPUT_URL);
String method = "get";
if(smssProp.containsKey(INPUT_METHOD))
method = smssProp.getProperty(INPUT_METHOD);
InputStream is = null;
if(method.equalsIgnoreCase("get"))
is = doGetI(url);
else
is = doPostI(new Hashtable());
Document doc = Jsoup.parse(is, "UTF-8", "");
Element thisTable = null;
if(smssProp.containsKey(TABLE_CLASS))
{
// this is a get by class
String className = smssProp.getProperty(TABLE_CLASS);
int tableNum = Integer.parseInt(smssProp.getProperty(TABLE_NUMBER).trim());
Elements allTables = doc.getElementsByClass(className);
thisTable = allTables.get(tableNum);
}
else if(smssProp.containsKey(TABLE_ID))
{
// get the table by id
String id = smssProp.getProperty(TABLE_ID);
thisTable = doc.getElementById(id);
}
else
{
// this is just table by tag
int tableNum = Integer.parseInt(smssProp.get(TABLE_NUMBER) + "");
Elements allTables = doc.getElementsByTag("table");
thisTable = allTables.get(tableNum);
}
// now I need to collect the header and rows and then return it
String[] headers = getHeaders(url, smssProp);
Integer[] selectedIndex = null;
String[] selectedHeaders = null;
// add filters
if (query != null) {
// specified in query so only send specific columns
List headersList = Arrays.asList(headers);
String[] selectCols = query.split(";");
selectedIndex = new Integer[selectCols.length];
selectedHeaders = new String[selectCols.length];
// get indexes of columns the user has selected
for (int i = 0; i < selectCols.length; i++) {
String[] headerSplit = selectCols[i].split("=");
logger.debug(Arrays.asList(headerSplit));
String header = headerSplit[0].trim();
selectedIndex[i] = headersList.indexOf(header);
selectedHeaders[i] = header;
}
} else {
throw new IllegalArgumentException("Must select one or more values from table headers!");
}
List values = collectValues(thisTable.getElementsByTag("tr"), headers, Arrays.asList(selectedIndex));
// add filtered headers if they exist
if (selectedHeaders != null) {
retHash.put("HEADERS", selectedHeaders);
} else {
retHash.put("HEADERS", headers);
}
String [] types = getWebTypes(values);
retHash.put("ROWS", values);
retHash.put("TYPES", types);
} catch (NumberFormatException e) {
logger.error(Constants.STACKTRACE, e);
} catch (IOException ioe) {
logger.error(Constants.STACKTRACE, ioe);
}
return retHash;
}
public String[] getHeaders(String url, Map aliasMap) {
String[] headers = null;
try {
String method = "get";
if (aliasMap.containsKey(INPUT_METHOD))
method = (String) aliasMap.get(INPUT_METHOD);
InputStream is = null;
if (method.equalsIgnoreCase("get"))
is = doGetI(url);
else
is = doPostI(new Hashtable());
Document doc = Jsoup.parse(is, "UTF-8", "");
Element thisTable = null;
if (aliasMap.containsKey(TABLE_CLASS)) {
// this is a get by class
String className = (String) aliasMap.get(TABLE_CLASS);
int tableNum = Integer.parseInt(aliasMap.get(TABLE_NUMBER) + "");
Elements allTables = doc.getElementsByClass(className);
thisTable = allTables.get(tableNum);
} else if (aliasMap.containsKey(TABLE_ID)) {
// get the table by id
String id = (String) aliasMap.get(TABLE_ID);
thisTable = doc.getElementById(id);
} else {
// this is just table by tag
int tableNum = Integer.parseInt(aliasMap.get(TABLE_NUMBER) + "");
Elements allTables = doc.getElementsByTag("table");
thisTable = allTables.get(tableNum);
}
// now I need to collect the header and rows and then return it
// see if there are rows
Elements allRows = thisTable.getElementsByTag("tr");
if (allRows != null && !allRows.isEmpty()) {
Elements firstRow = allRows.get(0).getElementsByTag("th");
headers = collectHeaders(firstRow);
}
} catch (Exception ex) {
logger.error(Constants.STACKTRACE, ex);
}
return headers;
}
public int getNumTables(String url, HashMap aliasMap) {
int tables = 0;
try {
String method = "get";
if (aliasMap.containsKey(INPUT_METHOD))
method = (String) aliasMap.get(INPUT_METHOD);
InputStream is = null;
if (method.equalsIgnoreCase("get"))
is = doGetI(url);
else
is = doPostI(new Hashtable());
Document doc = Jsoup.parse(is, "UTF-8", "");
tables = doc.getElementsByTag("table").size();
} catch (Exception ex) {
logger.error(Constants.STACKTRACE, ex);
}
return tables;
}
public String [] collectHeaders(Elements headers)
{
String [] headerList = new String[headers.size()];
for(int headerIndex = 0;headerIndex < headers.size();headerIndex++)
headerList[headerIndex] = Utility.makeAlphaNumeric(headers.get(headerIndex).text());
return headerList;
}
public List collectValues(Elements values, String [] headerList, List selectedHeaderIndicies)
{
int rowIndex = 0;
// need to get the td inside of each tr
// and then paint it
// I also need to account for table spans
// along with row span.. need to accomodate for column spans as well
// basically we fill the same value for each of the pieces
List valueList = new Vector();
// start from 1 since the first one is the header
Hashtable rowColValue = new Hashtable();
Hashtable rowSpanCount = new Hashtable();
for(rowIndex = 1;rowIndex < values.size();rowIndex++)
{
Element thisRow = values.get(rowIndex);
// get the TD on this row
// some of these are TH still.. thank you so much
Elements cols = thisRow.children();
// get the attribute to see
// I need to keep the index
// and I need to keep for how much I need to do that
String [] oneRow = new String[selectedHeaderIndicies.size()];
int colIndex = 0;
//int colspancount = -1;
//String colSpanData = null;
boolean dataFilled = false;
for(Integer headerIndex : selectedHeaderIndicies)
{
// colIndex = selectedHeaderIndicies
if(rowColValue.containsKey(headerIndex))
{
// need to use this value for the header
oneRow[headerIndex] = (String)rowColValue.get(headerIndex);
int numrows = (Integer)rowSpanCount.get(headerIndex);
numrows--;
if(numrows == 0)
{
rowColValue.remove(headerIndex);
rowSpanCount.remove(headerIndex);
}
else
rowSpanCount.put(headerIndex, numrows);
// move the header forward
//headerIndex++;
}
/*if(headerIndex < colspancount)
{
// when a actual data comes need to see what is coming
oneRow[headerIndex] = colSpanData;
}*/
else if(cols.size() > headerIndex)
{
dataFilled = true;
Element thisCol = cols.get(headerIndex);
oneRow[colIndex] = thisCol.text();
colIndex++;
// get the attribute to see if it a row span
String rowspan = thisCol.attr("rowspan");
if(rowspan != null && rowspan.length() > 0)
{
int numrows = Integer.parseInt(rowspan);
// remove for this row we just saw
numrows--;
rowColValue.put(headerIndex, thisCol.text());
rowSpanCount.put(headerIndex, numrows);
}
// get the attribute to see if it a row span
/*String colspan = thisCol.attr("colspan");
if(colspan != null && colspan.length() > 0)
{
colspancount = Integer.parseInt(colspan);
}*/
}
}
if(dataFilled)
valueList.add(oneRow);
}
/* while(rowIndex < values.size())
{
String [] oneRow = new String[headerList.length];
for(int headerIndex = 0;headerIndex < headerList.length;headerIndex++)
{
oneRow[headerIndex] = values.get(rowIndex).text();
rowIndex++;
}
valueList.add(oneRow);
}
*/ //valueList.set(0, headerList);
return valueList;
}
public String [] getWebTypes(List values)
{
// try it with 10
// for every column.. I need to see which are the possible ones
// and then select it
int rowLimit = 10;
if(values.size() <= 10)
rowLimit = values.size();
String [] oneRow = values.get(0);
String [] typeList = new String[oneRow.length];
int [] stringCount = new int[oneRow.length];
int [] intCount = new int[oneRow.length];
for(int rowIndex = 0;rowIndex < rowLimit;rowIndex++)
{
oneRow = values.get(rowIndex);
for(int colIndex = 0;colIndex < oneRow.length;colIndex++)
{
if(rowIndex == 0)
{
stringCount[colIndex] = 0;
intCount[colIndex] = 0;
}
String thisValue = oneRow[colIndex];
if(NumberUtils.isDigits(thisValue))
intCount[colIndex] = intCount[colIndex] + 1;
else
stringCount[colIndex] = stringCount[colIndex] + 1;
}
}
for(int colIndex = 0;colIndex < intCount.length;colIndex++)
{
if(stringCount[colIndex] > intCount[colIndex])
typeList [colIndex] = "String";
else
typeList[colIndex] = "int";
}
return typeList;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy