// com.univocity.api.entity.html.HtmlParser
// Artifact: univocity-html-parser-api (Univocity HTML Parser Public API), newest published version.
/*
* Copyright (c) 2013 Univocity Software Pty Ltd. All rights reserved.
* This file is subject to the terms and conditions defined in file
* 'LICENSE.txt', which is part of this source code package.
*/
package com.univocity.api.entity.html;
import com.univocity.api.*;
import com.univocity.api.io.*;
import com.univocity.parsers.common.*;
import com.univocity.parsers.common.processor.core.*;
import com.univocity.parsers.common.record.*;
import com.univocity.parsers.remote.*;
import java.io.*;
import java.nio.charset.*;
import java.util.*;
/**
 * A very fast HTML parser.
 *
 * <p>An instance is configured by the {@link HtmlEntityList} given to its constructor and delegates every
 * {@code parse(...)} call to an internal {@link HtmlParserInterface} implementation. Records of each entity are
 * submitted to the {@link Processor} associated with that entity (through
 * {@link EntitySettings#setProcessor(Processor)}). The {@link Processor} implementation handles the rows as they
 * come, in its {@link Processor#rowProcessed(String[], Context)} method, which can accumulate/transform the rows
 * on demand. The behavior and way to collect results is determined by the {@link Processor} implementation used.</p>
 *
 * <p>The static {@code parseTree(...)} methods do not use any instance state: each call obtains its own
 * {@link HtmlTreeParser} and returns the root {@link HtmlElement} of the parsed document.</p>
 *
 * <p>If the constructor received a {@code null} entity list, the instance has no underlying parser and every
 * instance method throws an {@link IllegalStateException}.</p>
 *
 * @author Univocity Software Pty Ltd
 * @see HtmlParserSettings
 * @see ReaderProvider
 * @see Record
 * @see HtmlEntitySettings
 */
public final class HtmlParser implements HtmlParserInterface {

    /** The underlying parser implementation; {@code null} when no entity list was provided to the constructor. */
    private final HtmlParserInterface parser;

    /**
     * Creates a new HtmlParser with the entity configuration provided by a {@link HtmlEntityList}. The
     * {@code HtmlParser} gets all configuration from this list and from {@link HtmlEntityList#getParserSettings()}.
     *
     * <p>A {@code null} list is accepted for backward compatibility, but the resulting instance cannot parse
     * anything: all of its instance methods will throw {@link IllegalStateException}.</p>
     *
     * @param entityList The list of entities to be parsed by the {@code HtmlParser}, and their configuration
     */
    public HtmlParser(HtmlEntityList entityList) {
        if (entityList == null) {
            parser = null;
        } else {
            parser = Builder.build(HtmlParserInterface.class, entityList);
        }
    }

    /**
     * Returns the underlying parser, failing with a descriptive error instead of a bare
     * {@code NullPointerException} when this instance was created without an entity list.
     *
     * @return the non-null underlying parser implementation
     *
     * @throws IllegalStateException if the constructor received a {@code null} {@link HtmlEntityList}
     */
    private HtmlParserInterface delegate() {
        if (parser == null) {
            throw new IllegalStateException("This HtmlParser was created with a null HtmlEntityList and cannot be used for parsing. "
                    + "Provide a non-null HtmlEntityList to the constructor, or use the static parseTree(...) methods.");
        }
        return parser;
    }

    /**
     * Builds the tree parser used by the static {@code parseTree(...)} methods. A new instance is requested
     * from the {@code Builder} on every call.
     *
     * @return a {@link HtmlTreeParser} implementation
     */
    private static HtmlTreeParser treeParser() {
        return Builder.build(HtmlTreeParser.class);
    }

    /**
     * Given an input, made available from a {@link ReaderProvider}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * @param readerProvider an input provider with content to be parsed
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(ReaderProvider readerProvider) {
        return delegate().parse(readerProvider);
    }

    /**
     * Given an input, made available from a {@link FileProvider}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * @param fileProvider the input file with content to be parsed
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(FileProvider fileProvider) {
        return delegate().parse(fileProvider);
    }

    /**
     * Given a {@link java.io.File}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * <p>The default system encoding will be used to read text from the given input. The file is wrapped in a
     * {@link FileProvider} and handed to {@link #parse(FileProvider)}.</p>
     *
     * @param file the input with content to be parsed
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(File file) {
        return parse(new FileProvider(file));
    }

    /**
     * Given a {@link java.io.File}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * <p>The file and encoding are wrapped in a {@link FileProvider} and handed to {@link #parse(FileProvider)}.</p>
     *
     * @param file     the input with content to be parsed
     * @param encoding the encoding to be used when reading text from the given input.
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(File file, Charset encoding) {
        return parse(new FileProvider(file, encoding));
    }

    /**
     * Given a {@link java.io.File}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * <p>The file and encoding name are wrapped in a {@link FileProvider} and handed to
     * {@link #parse(FileProvider)}.</p>
     *
     * @param file     the input with content to be parsed
     * @param encoding the name of the encoding to be used when reading text from the given input.
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(File file, String encoding) {
        return parse(new FileProvider(file, encoding));
    }

    /**
     * Given a {@link java.io.Reader}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * @param reader the input with content to be parsed
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(Reader reader) {
        return delegate().parse(reader);
    }

    /**
     * Given an {@link java.io.InputStream}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * <p>The default system encoding will be used to read text from the given input.</p>
     *
     * @param inputStream the input with content to be parsed
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(InputStream inputStream) {
        return delegate().parse(inputStream);
    }

    /**
     * Given an {@link java.io.InputStream}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * @param inputStream the input with content to be parsed
     * @param encoding    the encoding to be used when reading text from the given input.
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(InputStream inputStream, Charset encoding) {
        return delegate().parse(inputStream, encoding);
    }

    /**
     * Given an {@link java.io.InputStream}, parses all records of all entities
     * defined in the {@link HtmlEntityList} provided in the constructor of this class, submitting them to the
     * {@link Processor} implementation associated with each entity (through
     * {@link EntitySettings#setProcessor(Processor)}). The behavior and way to collect results is determined by
     * the {@link Processor} implementation used.
     *
     * @param inputStream the input with content to be parsed
     * @param encoding    the name of the encoding to be used when reading text from the given input.
     *
     * @return the {@link Results} produced by the underlying parser for the given input
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(InputStream inputStream, String encoding) {
        return delegate().parse(inputStream, encoding);
    }

    /**
     * Given a {@link HtmlElement}, parses all records of all entities
     * defined in the {@link EntityList} of this parser, submitting them to the {@link Processor} implementation
     * associated with each entity (through {@link EntitySettings#setProcessor(Processor)}). The behavior and way
     * to collect results is determined by the {@link Processor} implementation used.
     *
     * @param htmlTree the HTML tree with content to be parsed
     *
     * @return the {@link Results} produced by the underlying parser for the given tree
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    public final Results parse(HtmlElement htmlTree) {
        return delegate().parse(htmlTree);
    }

    /**
     * Returns the {@link HtmlPaginationContext} object with information collected for the configured
     * {@link HtmlPaginator}, if any. The information returned comes from the last input processed, and might have
     * been modified by a {@link NextInputHandler} if it has been associated with the {@link HtmlPaginator}
     * using {@link Paginator#setPaginationHandler(NextInputHandler)}.
     *
     * @return the current {@link PaginationContext} with pagination information captured after parsing a given input.
     *
     * @throws IllegalStateException if this parser was created with a {@code null} {@link HtmlEntityList}
     */
    @Override
    public HtmlPaginationContext getPaginationContext() {
        return delegate().getPaginationContext();
    }

    /**
     * Generates a DOM tree from the input made available by a {@link ReaderProvider}. Users can navigate the HTML
     * tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * @param readerProvider an input provider with content to be parsed
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(ReaderProvider readerProvider) {
        return treeParser().parseTree(readerProvider);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link FileProvider}. Users can navigate the HTML
     * tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * @param fileProvider the input file with content to be parsed
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(FileProvider fileProvider) {
        return treeParser().parseTree(fileProvider);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link java.io.Reader}. Users can navigate the HTML
     * tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * @param reader the input with content to be parsed
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(Reader reader) {
        return treeParser().parseTree(reader);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link java.io.InputStream}. Users can navigate the
     * HTML tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * <p>The default system encoding will be used to read text from the given input.</p>
     *
     * @param inputStream the input with content to be parsed
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(InputStream inputStream) {
        return treeParser().parseTree(inputStream);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link java.io.InputStream}. Users can navigate the
     * HTML tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * @param inputStream the input with content to be parsed
     * @param encoding    the encoding to be used when reading text from the given input.
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(InputStream inputStream, Charset encoding) {
        return treeParser().parseTree(inputStream, encoding);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link java.io.InputStream}. Users can navigate the
     * HTML tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * @param inputStream the input with content to be parsed
     * @param encoding    the name of the encoding to be used when reading text from the given input.
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(InputStream inputStream, String encoding) {
        return treeParser().parseTree(inputStream, encoding);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link java.io.File}. Users can navigate the HTML
     * tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * <p>The default system encoding will be used to read text from the given input.</p>
     *
     * @param file the input with content to be parsed
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(File file) {
        return treeParser().parseTree(file);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link java.io.File}. Users can navigate the HTML
     * tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * @param file     the input with content to be parsed
     * @param encoding the encoding to be used when reading text from the given input.
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(File file, Charset encoding) {
        return treeParser().parseTree(file, encoding);
    }

    /**
     * Generates a DOM tree from the input made available by a {@link java.io.File}. Users can navigate the HTML
     * tree and use CSS selectors against the {@link HtmlElement}s returned to target any specific HTML node.
     *
     * @param file     the input with content to be parsed
     * @param encoding the name of the encoding to be used when reading text from the given input.
     *
     * @return the root {@link HtmlElement} of the entire HTML document.
     */
    public static final HtmlElement parseTree(File file, String encoding) {
        return treeParser().parseTree(file, encoding);
    }
}