
// us.codecraft.webmagic.selector.HtmlNode (Maven / Gradle / Ivy)
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
/**
 * A selectable backed by a list of jsoup {@link Element}s.
 * <p>
 * Element-based selectors ({@link BaseElementSelector}, e.g. the xpath and css selectors used
 * below) are evaluated against the held elements themselves. Any other selector is evaluated
 * against the elements' string form as produced by {@link #getSourceTexts()}.
 *
 * @author [email protected]
 */
public class HtmlNode extends AbstractSelectable {

    /** Elements this node selects from; {@code null} when created through the no-arg constructor. */
    private final List<Element> elements;

    /**
     * Creates a node over the given elements.
     * <p>
     * The list is stored as-is (not copied) and may be modified in place by
     * {@link #selectElements(BaseElementSelector)}, so it has to support
     * {@link ListIterator#set(Object)} whenever it contains elements that are not {@link Document}s.
     *
     * @param elements elements to select from
     */
    public HtmlNode(List<Element> elements) {
        this.elements = elements;
    }

    /**
     * Creates a node without an element list.
     * <p>
     * {@link #getElements()} returns {@code null} for such a node, so every method of this class
     * that iterates over the elements fails unless {@link #getElements()} is overridden.
     * NOTE(review): presumably intended for subclasses that supply their own elements - confirm.
     */
    public HtmlNode() {
        elements = null;
    }

    /**
     * @return the elements this node selects from
     */
    protected List<Element> getElements() {
        return elements;
    }

    /**
     * Applies the smart-content selector to the string form of the elements.
     */
    @Override
    public Selectable smartContent() {
        SmartContentSelector smartContentSelector = Selectors.smartContent();
        return select(smartContentSelector, getSourceTexts());
    }

    /**
     * Selects the {@code href} attribute of every {@code a} element.
     */
    @Override
    public Selectable links() {
        return xpath("//a/@href");
    }

    @Override
    public Selectable xpath(String xpath) {
        XpathSelector xpathSelector = Selectors.xpath(xpath);
        return selectElements(xpathSelector);
    }

    /**
     * Element selectors work on the elements directly; all other selectors work on the
     * elements' string form.
     */
    @Override
    public Selectable selectList(Selector selector) {
        if (selector instanceof BaseElementSelector) {
            return selectElements((BaseElementSelector) selector);
        }
        return selectList(selector, getSourceTexts());
    }

    /**
     * Delegates to {@link #selectList(Selector)}.
     */
    @Override
    public Selectable select(Selector selector) {
        return selectList(selector);
    }

    /**
     * Runs an element selector against every held element.
     * <p>
     * Elements that are not {@link Document}s are replaced in the underlying list by a wrapping
     * document while iterating (see {@link #checkElementAndConvert(ListIterator)}).
     *
     * @param elementSelector elementSelector
     * @return an {@link HtmlNode} of the matched elements when the selector has no attribute,
     *         otherwise a {@link PlainText} of the selected strings
     */
    protected Selectable selectElements(BaseElementSelector elementSelector) {
        ListIterator<Element> elementIterator = getElements().listIterator();
        if (!elementSelector.hasAttribute()) {
            List<Element> resultElements = new ArrayList<Element>();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                List<Element> selectElements = elementSelector.selectElements(element);
                resultElements.addAll(selectElements);
            }
            return new HtmlNode(resultElements);
        } else {
            // has attribute, consider as plaintext
            List<String> resultStrings = new ArrayList<String>();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                List<String> selectList = elementSelector.selectList(element);
                resultStrings.addAll(selectList);
            }
            return new PlainText(resultStrings);
        }
    }

    /**
     * Advances the iterator and returns a {@link Document} to select from, because only a
     * document can be selected.
     * See: https://github.com/code4craft/webmagic/issues/113
     * <p>
     * If the next element already is a {@link Document} it is returned unchanged. Otherwise a
     * clone of it is appended to a new, otherwise empty {@link Document}, and that document
     * replaces the element in the underlying list through {@link ListIterator#set(Object)}.
     *
     * @param elementIterator iterator positioned before the element to check
     * @return the element itself if it is a document, otherwise the new wrapping document
     */
    private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
        Element element = elementIterator.next();
        if (!(element instanceof Document)) {
            // A detached element has no owner document; use its own base URI instead of
            // dereferencing null.
            Document owner = element.ownerDocument();
            String baseUri = owner != null ? owner.baseUri() : element.baseUri();
            Document root = new Document(baseUri);
            Element clone = element.clone();
            root.appendChild(clone);
            elementIterator.set(root);
            return root;
        }
        return element;
    }

    @Override
    public Selectable $(String selector) {
        CssSelector cssSelector = Selectors.$(selector);
        return selectElements(cssSelector);
    }

    @Override
    public Selectable $(String selector, String attrName) {
        CssSelector cssSelector = Selectors.$(selector, attrName);
        return selectElements(cssSelector);
    }

    /**
     * Splits this node into one {@link HtmlNode} per held element, in list order.
     */
    @Override
    public List<Selectable> nodes() {
        List<Selectable> selectables = new ArrayList<Selectable>();
        for (Element element : getElements()) {
            // Each child gets its own mutable list: selectElements may replace entries in place.
            List<Element> childElements = new ArrayList<Element>(1);
            childElements.add(element);
            selectables.add(new HtmlNode(childElements));
        }
        return selectables;
    }

    /**
     * @return the {@code toString()} form of every held element, in list order
     */
    @Override
    protected List<String> getSourceTexts() {
        List<String> sourceTexts = new ArrayList<String>(getElements().size());
        for (Element element : getElements()) {
            sourceTexts.add(element.toString());
        }
        return sourceTexts;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy