All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.univocity.api.entity.html.HtmlEntitySettings Maven / Gradle / Ivy

/*
 * Copyright (c) 2013 Univocity Software Pty Ltd. All rights reserved.
 * This file is subject to the terms and conditions defined in file
 * 'LICENSE.txt', which is part of this source code package.
 */

package com.univocity.api.entity.html;

import com.univocity.api.*;
import com.univocity.api.common.*;
import com.univocity.api.entity.html.builders.*;
import com.univocity.api.net.*;
import com.univocity.parsers.common.*;
import com.univocity.parsers.remote.*;

import java.util.*;

/**
 * A `HtmlEntitySettings` object manages the configuration of a HTML entity. An entity has a name and one or more
 * fields. These fields have paths to the elements that will have their data collected. In addition, a {@link HtmlParserListener}
 * can be associated with an entity to notify the user of actions made by the {@link HtmlParser}.
 *
 * @author Univocity Software Pty Ltd - [email protected]
 * @see HtmlEntityList
 * @see HtmlParser
 * @see HtmlParserListener
 * @see HtmlParsingContext
 */
public class HtmlEntitySettings extends RemoteEntitySettings implements FieldDefinition {

	Map fields = new LinkedHashMap();
	final List triggers = new ArrayList(1);
	private HtmlParserListener listener = null;

	/**
	 * Creates a new HTML entity configuration and associates it with the supplied name.
	 *
	 * @param entityName a string that identifies the HTMLEntity
	 */
	HtmlEntitySettings(String entityName, HtmlEntitySettings parentEntity) {
		super(entityName, createEmptyParserSettings(), parentEntity);
	}

	@Override
	public final PathStart addSilentField(String fieldName) {
		return newField(fieldName, false, true);
	}

	@Override
	public final PathStart addField(String fieldName) {
		return newField(fieldName, false, false);
	}

	@Override
	public final PathStart addPersistentField(String fieldName) {
		return newField(fieldName, true, false);
	}


	/**
	 * Used by the field adding methods. Creates a new {@link PathStart} based on the supplied options and adds it
	 * to the given field. Finally, returns the created {@link PathStart}.
	 *
	 * @param fieldName      the name that identifies the field
	 * @param persistent     if true, the field is persistent
	 * @param inhibitNewRows if true, the field is silent
	 *
	 * @return a {@link PathStart} to define a path
	 */
	private PathStart newField(String fieldName, boolean persistent, boolean inhibitNewRows) {
		FieldPath pathBuilder = Builder.build(FieldPath.class, fieldName, this, persistent, inhibitNewRows);
		addPathToField(fieldName, pathBuilder);
		return pathBuilder;
	}

	/**
	 * Used by {@link #newField(String, boolean, boolean)} to add a path to the field. It tries to get the
	 * field from the fields map. If it doesn't exist, it puts a new field into the map and associates it with the
	 * given path.
	 *
	 * @param fieldName the name identifies the field
	 * @param path      the path that will be associated with the field
	 */
	final void addPathToField(String fieldName, FieldPath path) {
		List paths = (List) fields.get(fieldName);
		if (paths == null) {
			paths = new ArrayList();
			fields.put(fieldName, paths);
		}
		paths.add(path);
	}


	/**
	 * Used by {@link #addRecordTrigger()}. When a record trigger is defined, it gets added to the
	 * {@link #triggers} list
	 *
	 * @param trigger the {@link RecordTrigger} that will be added to the list
	 */
	final void addTrigger(RecordTrigger trigger) {
		this.triggers.add(trigger);
	}

	/**
	 * Returns a {@link PartialPathStart} that is used to define a reusable path of HTML elements. Fields then can
	 * added to this path using {@link PartialPath#addField(String)} and others, which associates the field with this entity.
	 *
	 * Example:
	 *
	 * ```java
	 * HtmlEntityList entityList = new HtmlEntityList();
	 * HtmlEntitySettings items = entityList.configureEntity("items");
	 * PartialPath path = items.newPath()
	 *     .match("table").id("productsTable")
	 *     .match("td").match("div").classes("productContainer");
	 *
	 * //uses the path to add new fields to it and further element matching rules from the initial, common path.
	 * path.addField("name").match("span").classes("prodName", "prodNameTro").getText();
	 * path.addField("URL").match("a").childOf("div").classes("productPadding").getAttribute("href")
	 * ```
	 *
	 * @return a {@link PartialPathStart} to specify the path of HTML elements
	 */
	public final PartialPathStart newPath() {
		return Builder.build(PartialPathStart.class, this);
	}

	/**
	 * Returns a {@link GroupStart} that allows for a {@link Group} to be defined. A {@link Group} demarcates a section
	 * of the HTML input that is allowed to be parsed. {@link FieldPath}s created from a group will only be executed inside
	 * this defined area, ignoring any HTML that exists outside of it. For example, say you wanted to extract
	 * the "hello" and "howdy" words from the following HTML:
	 *
	 * ```html
	 * 
*

hello

*
*

howdy

*

No Parsing Area

*

don't parse me!

* ``` * * The parsing rules, using groups, can be defined as: * * ```java * HtmlEntityList entityList = new HtmlEntityList(); * HtmlParserSettings settings = new HtmlParserSettings(entityList); * * Group group = entityList.configureEntity("test") * .newGroup() * .startAt("div").classes("parseMe") * .endAt("h1"); * * group.addField("greeting").match("p").getText(); * ``` * * The parser will then ignore the `"don't parse me"` paragraph as the group restricts the parsing to the area * defined from a `div` with `class` "parseMe" until an opening `h1` tag. * * @return a {@link GroupStart} used to specify where the {@link Group} starts. */ public final GroupStart newGroup() { return Builder.build(GroupStart.class, this); } /** * Creates a new {@link PaginationGroup} for the {@link HtmlPaginator}. * * @return a {@link PaginationGroupStart} used to specify where {@link PaginationGroup} starts. */ final PaginationGroupStart newPaginationGroup(HtmlPaginator paginator) { return Builder.build(PaginationGroupStart.class, paginator); } /** * Returns a {@link RecordTriggerStart} that is used to specify a path that defines when rows should be created. * * See documentation in {@link Trigger#addRecordTrigger()} for a detailed explanation. * * @return a {@link RecordTriggerStart} that defines the path for the trigger */ public final RecordTriggerStart addRecordTrigger() { RecordTrigger out = Builder.build(RecordTrigger.class, this); addTrigger(out); return out; } @Override public final Set getFieldNames() { return Collections.unmodifiableSet(fields.keySet()); } @Override public final void removeField(String fieldName) { fields.remove(fieldName); } @Override public final void addField(String fieldName, String constantValue) { fields.put(fieldName, constantValue); } @Override public final void addFieldFromParent(String field) { if (getParentEntitySettings() == null) { throw new IllegalArgumentException("Can't add parent field '" + field + "' to '" + getEntityName() + "'. No parent entity defined."); } addFieldFrom(getParentEntitySettings().getEntityName(), field); } @Override public final void addFieldFrom(String parentEntityName, String field) { Args.notBlank(parentEntityName, "Parent entity name"); Args.notBlank(field, "Field from parent entity " + parentEntityName); HtmlEntitySettings parent = getParentEntitySettings(); Set allParentFields = new TreeSet(); while (parent != null) { if (parentEntityName.equalsIgnoreCase(parent.getEntityName())) { Set parentFields = parent.getFieldNames(); if (parentFields.contains(field)) { this.fields.put(field, parent); return; } else { allParentFields.addAll(parentFields); } } parent = parent.getParentEntitySettings(); } if (parent == null) { throw new IllegalArgumentException("Can't find entity '" + parentEntityName + "' in hierarchy of entity '" + this.getEntityName() + "'"); } else { throw new IllegalArgumentException("Can't find field '" + field + "' in parent entity '" + parentEntityName + "'. Available fields are: " + allParentFields); } } /** * Associates a {@link HtmlParserListener} with this HTML entity. The listener methods will be triggered * by the {@link HtmlParser} while it traverses the HTML structure to collect values for the fields of this entity. * In essence, a {@link HtmlParserListener} provides information about events that occur during the parsing process. * * Important:The listener methods are used in a concurrent environment. If you are using the same * instance on multiple entities make sure your listener implementation is thread-safe, or limit the number * of threads to be used when parsing to 1 with {@link HtmlParserSettings#setParserThreadCount(int)} * * @param listener the {@link HtmlParserListener} to be used when the parser executes to collect values for the fields * of this entity. */ public final void setListener(HtmlParserListener listener) { this.listener = listener; } /** * Returns the {@link HtmlParserListener} associated with this HTML entity. The listener methods will be triggered * by the {@link HtmlParser} while it traverses the HTML structure to collect values for the fields of this entity * In essence, a {@link HtmlParserListener} provides information about events that occur during the parsing process. * * Important:The listener methods are used in a concurrent environment. If you are using the same * instance on multiple entities make sure your listener implementation is thread-safe, or limit the number * of threads to be used when parsing to 1 with {@link HtmlParserSettings#setParserThreadCount(int)} * * @return the {@link HtmlParserListener} to be used when the parser executes to collect values for the fields * of this entity. */ public final HtmlParserListener getListener() { return listener; } HtmlLinkFollower addHtmlLinkFollower(String fieldName) { HtmlLinkFollower htmlLinkFollower = new HtmlLinkFollower(this); //FIXME: will break if multiple paths are assigned to same field name and a different link follower is to be used for each path. followers.put(fieldName, htmlLinkFollower); return htmlLinkFollower; } /** * Creates a {@link HtmlLinkFollower} a field with the name provided. The link follower will access the * {@link UrlReaderProvider} all values collected from this resource will be joined with the results of the current * entity using the {@link #getNesting()} strategy defined. * * A parametrized URL can be used here so values from each record produced by this entity can replace parameters * in the URL. Use {@link HtmlLinkFollower#assigning} to replace the URL parameters. * * @param fieldName the name of the field associated with {@link HtmlLinkFollower} * @param urlReaderProvider the url that the {@link HtmlLinkFollower} will follow * * @return this {@link HtmlLinkFollower} to allow for method chaining */ public HtmlLinkFollower followLink(String fieldName, UrlReaderProvider urlReaderProvider) { HtmlLinkFollower follower = ((FieldContentTransform) addField(fieldName)).followLink(urlReaderProvider); return follower; } @Override protected final HtmlEntitySettings getParentEntitySettings() { return (HtmlEntitySettings) parentEntity; } @Override protected final CommonParserSettings getInternalSettings() { return super.getInternalSettings(); } @Override protected HtmlEntitySettings clone() { HtmlEntitySettings out = (HtmlEntitySettings) super.clone(); out.fields = new LinkedHashMap(); return out; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy