All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.persist.WebPage Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.persist;

import ai.platon.pulsar.common.DateTimes;
import ai.platon.pulsar.common.HtmlIntegrity;
import ai.platon.pulsar.common.Strings;
import ai.platon.pulsar.common.browser.BrowserType;
import ai.platon.pulsar.common.config.VolatileConfig;
import ai.platon.pulsar.common.urls.UrlUtils;
import ai.platon.pulsar.persist.experimental.WebAsset;
import ai.platon.pulsar.persist.gora.generated.*;
import ai.platon.pulsar.persist.metadata.*;
import ai.platon.pulsar.persist.model.*;
import org.apache.avro.util.Utf8;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.gora.util.ByteUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.xml.sax.InputSource;

import java.io.ByteArrayInputStream;
import java.nio.ByteBuffer;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneId;
import java.time.temporal.ChronoUnit;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.stream.Collectors;

import static ai.platon.pulsar.common.PulsarParams.VAR_LOAD_OPTIONS;
import static ai.platon.pulsar.common.config.AppConstants.*;

/**
 * The core web page structure
 */
final public class WebPage implements Comparable, WebAsset {
    /**
     * The WebPage object sequence number generator.
     * */
    private static final AtomicInteger SEQUENCER = new AtomicInteger();
    /**
     * The nil page.
     * */
    public static final WebPage NIL = newInternalPage(NIL_PAGE_URL, 0, "nil", "nil");
    /**
     * The page id which is unique in process scope.
     * */
    private Integer id = SEQUENCER.incrementAndGet();
    /**
     * The url is the permanent internal address, while the location is the last working address.
     */
    @NotNull
    private String url = "";
    /**
     * The reversed url of the web page, it's also the key of the underlying storage of this webpage.
     * It's faster to retrieve the page by the reversed url.
     */
    @NotNull
    private String reversedUrl = "";
    /**
     * The underlying persistent object.
     */
    @NotNull
    private GWebPage page;
    /**
     * A webpage scope configuration, any modifications made to it will exclusively impact this particular webpage.
     */
    @NotNull
    private VolatileConfig conf;
    /**
     * Web page scope variables
     */
    private final Variables variables = new Variables();
    /**
     * Store arbitrary data associated with the webpage.
     */
    private final Variables data = new Variables();
    /**
     * The page datum for update.
     * Page datum is collected during the fetch phase and is used to update the page in the update phase.
     * */
    private PageDatum pageDatum = null;
    /**
     * If this page is flagged as cached (presumably: served from a cache instead of being fetched; confirm with callers of setCached)
     */
    private boolean isCached = false;

    /**
     * If this page is loaded from database or is created and fetched from the web
     */
    private boolean isLoaded = false;

    /**
     * If this page is fetched from Internet
     */
    private boolean isFetched = false;

    /**
     * If this page is canceled
     */
    private boolean isCanceled = false;

    /**
     * If this page is fetched and updated
     */
    private volatile boolean isContentUpdated = false;

    /**
     * The cached content.
     * TODO: use a loading cache for all cached page contents.
     */
    private volatile ByteBuffer tmpContent = null;

    /**
     * The delay time to retry if a retry is needed
     */
    private Duration retryDelay = Duration.ZERO;

    /**
     * The field loader to load fields lazily.
     */
    private Function lazyFieldLoader = null;

    private final List lazyLoadedFields = new ArrayList<>();

    private final Object CONTENT_MONITOR = new Object();
    private final Object PAGE_MODEL_MONITOR = new Object();
//    private final Deque lazyLoadedFields = new ConcurrentLinkedDeque<>();

    private WebPage(
            @NotNull String urlOrKey, @NotNull GWebPage page, boolean urlReversed, @NotNull VolatileConfig conf
    ) {
        this.url = urlReversed ? UrlUtils.unreverseUrl(urlOrKey) : urlOrKey;
        this.reversedUrl = urlReversed ? urlOrKey : UrlUtils.reverseUrlOrEmpty(urlOrKey);
        this.conf = conf;
        this.page = page;

        if (page.getBaseUrl() == null) {
            setLocation(this.url);
        }
    }

    private WebPage(
            @NotNull String url, @NotNull String reversedUrl, @NotNull GWebPage page, @NotNull VolatileConfig conf
    ) {
        this.url = url;
        this.reversedUrl = reversedUrl;
        this.conf = conf;
        this.page = page;

        if (page.getBaseUrl() == null) {
            setLocation(this.url);
        }
    }

    /**
     * Create a new page for the url without a hypertext reference.
     *
     * @param url  the url of the page
     * @param conf the webpage scope configuration
     * @return the new page
     */
    @NotNull
    public static WebPage newWebPage(@NotNull String url, @NotNull VolatileConfig conf) {
        final String noHref = null;
        return newWebPage(url, conf, noHref);
    }

    /**
     * Create a new page for the url.
     *
     * @param url  the url of the page
     * @param conf the webpage scope configuration
     * @param href the hypertext reference of the page, may be null
     * @return the new page
     */
    @NotNull
    public static WebPage newWebPage(@NotNull String url, @NotNull VolatileConfig conf, @Nullable String href) {
        WebPage created = newWebPageInternal(url, conf, href);
        return created;
    }

    /**
     * Create a new page backed by a freshly built {@link GWebPage}.
     * The page starts with crawl status {@code STATUS_UNFETCHED}, score 0 and fetch count 0,
     * and its create time and modified time are set to the same instant.
     *
     * @param url  the url of the page, also used as its location
     * @param conf the webpage scope configuration
     * @param href the hypertext reference of the page, may be null
     * @return the new page
     */
    @NotNull
    private static WebPage newWebPageInternal(@NotNull String url, @NotNull VolatileConfig conf, @Nullable String href) {
        // The constructor already stores conf, so it is not set again here.
        WebPage page = new WebPage(url, GWebPage.newBuilder().build(), false, conf);

        // Read the clock once, so a new page never has a modified time later than its create time.
        Instant now = Instant.now();

        page.setLocation(url);
        page.setHref(href);
        page.setCrawlStatus(CrawlStatus.STATUS_UNFETCHED);
        page.setCreateTime(now);
        page.setModifiedTime(now);
        page.setScore(0);
        page.setFetchCount(0);

        return page;
    }

    /**
     * Create an internal page whose title and content are both "internal".
     *
     * @param url the url of the internal page
     * @return the new internal page
     */
    @NotNull
    public static WebPage newInternalPage(@NotNull String url) {
        final String placeholder = "internal";
        return newInternalPage(url, placeholder, placeholder);
    }

    /**
     * Create an internal page with the given title and the content "internal".
     *
     * @param url   the url of the internal page
     * @param title the page title
     * @return the new internal page
     */
    @NotNull
    public static WebPage newInternalPage(@NotNull String url, @NotNull String title) {
        final String defaultContent = "internal";
        return newInternalPage(url, title, defaultContent);
    }

    /**
     * Create an internal page with the given title and content, keeping the generated page id.
     *
     * @param url     the url of the internal page
     * @param title   the page title
     * @param content the page content
     * @return the new internal page
     */
    @NotNull
    public static WebPage newInternalPage(@NotNull String url, @NotNull String title, @NotNull String content) {
        // A negative id tells the full overload not to override the generated id.
        final int keepGeneratedId = -1;
        return newInternalPage(url, keepGeneratedId, title, content);
    }

    /**
     * Create an internal page, which is marked with {@code Mark.INTERNAL} and {@code Mark.INACTIVE}.
     * The page uses the {@code UNSAFE} volatile config.
     *
     * @param url     the url of the internal page, also used as its location
     * @param id      the page id to assign; a negative value keeps the generated id
     * @param title   the page title
     * @param content the page content
     * @return the new internal page
     */
    @NotNull
    public static WebPage newInternalPage(@NotNull String url, int id, @NotNull String title, @NotNull String content) {
        WebPage internalPage = WebPage.newWebPage(url, VolatileConfig.Companion.getUNSAFE());
        // Only a non-negative id replaces the generated one.
        if (!(id < 0)) {
            internalPage.id = id;
        }

        internalPage.setLocation(url);

        // Fetch schedule fields
        final Instant epoch = Instant.EPOCH;
        internalPage.setModifiedTime(epoch);
        internalPage.setPrevFetchTime(epoch);
        internalPage.setFetchTime(DateTimes.INSTANCE.getDoomsday());
        internalPage.setFetchInterval(ChronoUnit.CENTURIES.getDuration());
        internalPage.setFetchPriority(FETCH_PRIORITY_MIN);
        internalPage.setCrawlStatus(CrawlStatus.STATUS_UNFETCHED);

        internalPage.setDistance(DISTANCE_INFINITE); // or -1?

        // Marks
        internalPage.getMarks().put(Mark.INTERNAL, YES_STRING);
        internalPage.getMarks().put(Mark.INACTIVE, YES_STRING);

        // Title and content
        internalPage.setPageTitle(title);
        internalPage.setContent(content);

        return internalPage;
    }

    /**
     * Wrap an existing GWebPage instance into a WebPage, with both the url and the reversed url given.
     *
     * @param url         the url of the page
     * @param reversedUrl the reversed url of the page
     * @param page        the underlying GWebPage instance
     * @param conf        the webpage scope configuration
     * @return the WebPage wrapping {@code page}
     */
    @NotNull
    public static WebPage box(
            @NotNull String url, @NotNull String reversedUrl, @NotNull GWebPage page, @NotNull VolatileConfig conf) {
        WebPage boxed = new WebPage(url, reversedUrl, page, conf);
        return boxed;
    }

    /**
     * Wrap an existing GWebPage instance into a WebPage, the url is not reversed.
     *
     * @param url  the url of the page
     * @param page the underlying GWebPage instance
     * @param conf the webpage scope configuration
     * @return the WebPage wrapping {@code page}
     */
    @NotNull
    public static WebPage box(@NotNull String url, @NotNull GWebPage page, @NotNull VolatileConfig conf) {
        final boolean urlReversed = false;
        return box(url, page, urlReversed, conf);
    }

    /**
     * Wrap an existing GWebPage instance into a WebPage.
     *
     * @param url         the url of the page, or the reversed url if {@code urlReversed} is true
     * @param page        the underlying GWebPage instance
     * @param urlReversed whether {@code url} is in the reversed form
     * @param conf        the webpage scope configuration
     * @return the WebPage wrapping {@code page}
     */
    @NotNull
    public static WebPage box(
            @NotNull String url, @NotNull GWebPage page, boolean urlReversed, @NotNull VolatileConfig conf
    ) {
        WebPage boxed = new WebPage(url, page, urlReversed, conf);
        return boxed;
    }

    /**
     * Wrap a string key into a Utf8 string.
     *
     * Unlike {@link String}, Utf8 instances are mutable. This is more
     * efficient than {@link String} when reading or writing a sequence of values,
     * as a single instance may be reused.
     *
     * @param key the key to wrap
     * @return a new Utf8 holding {@code key}
     * */
    @NotNull
    public static Utf8 wrapKey(@NotNull String key) {
        final Utf8 wrapped = u8(key);
        return wrapped;
    }

    /**
     * Wrap the value of a mark into a Utf8 string.
     *
     * Unlike {@link String}, Utf8 instances are mutable. This is more
     * efficient than {@link String} when reading or writing a sequence of values,
     * as a single instance may be reused.
     *
     * @param mark the mark whose value is wrapped
     * @return a new Utf8 holding the value of {@code mark}
     * */
    @NotNull
    public static Utf8 wrapKey(@NotNull Mark mark) {
        String markValue = mark.value();
        return u8(markValue);
    }

    /**
     * Return a Utf8 string.
     *

* Unlike {@link String}, instances are mutable. This is more * efficient than {@link String} when reading or writing a sequence of values, * as a single instance may be reused. * */ @Nullable public static Utf8 u8(@Nullable String value) { if (value == null) { // TODO: return new Utf8.EMPTY? return null; } return new Utf8(value); } @NotNull public String getUrl() { return url; } @NotNull public String getKey() { return getReversedUrl(); } @NotNull public String getReversedUrl() { return reversedUrl; } /** * A process scope page id. * */ public int getId() { return id; } /** * Get The hypertext reference of this page. * It defines the address of the document, which this time is linked from *

* TODO: use a separate field for href * * @return The hypertext reference */ @Nullable public String getHref() { return getMetadata().get(Name.HREF); } /** * Set The hypertext reference of this page. * It defines the address of the document, which this time is linked from * * @param href The hypertext reference */ public void setHref(@Nullable String href) { getMetadata().set(Name.HREF, href); } public boolean isNil() { return this == NIL; } public boolean isNotNil() { return !isNil(); } public boolean isInternal() { return hasMark(Mark.INTERNAL); } public boolean isNotInternal() { return !isInternal(); } @NotNull public GWebPage unbox() { return page; } public void unsafeSetGPage(@NotNull GWebPage page) { this.page = page; } public void unsafeCloneGPage(WebPage page) { unsafeSetGPage(GWebPage.newBuilder(page.unbox()).build()); } ////////////////////////////////////////////////////////////////////////////////// // // Common fields // @NotNull public Variables getVariables() { return variables; } /** * Check if the page scope temporary variable with {@code name} exists * * @param name The variable name to check * @return true if the variable exist */ public boolean hasVar(@NotNull String name) { return variables.contains(name); } /** * Returns the page scope temporary variable to which the specified name is mapped, * or {@code null} if the local variable map contains no mapping for the name. * * @param name the name whose associated value is to be returned * @return the value to which the specified name is mapped, or * {@code null} if the local variable map contains no mapping for the key */ @Nullable public Object getVar(@NotNull String name) { return variables.get(name); } /** * Returns the page scope temporary variable to which the specified name is mapped, * or {@code null} if the local variable map contains no mapping for the name. 
* * @param clazz the class of the variable */ @Nullable public Object getVar(Class clazz) { return variables.get(clazz.getName()); } /** * Retrieves and removes the local variable with the given name. */ public Object removeVar(@NotNull String name) { return variables.remove(name); } /** * Set a page scope temporary variable. * * @param name The variable name. * @param value The variable value. */ public void setVar(@NotNull String name, @NotNull Object value) { variables.set(name, value); } public void setVar(@NotNull T value) { variables.set(value.getClass().getName(), value); } /** * Returns the data to which the specified name is mapped, * or {@code null} if the data map contains no mapping for the name. * * @param name the name whose associated value is to be returned * @return the value to which the specified name is mapped, or * {@code null} if the local variable map contains no mapping for the key */ @Nullable public Object data(@NotNull String name) { return data.get(name); } /** * Store arbitrary data associated with the webpage. * * @param name A string naming the piece of data to set. * @param value The new data value. */ public void data(@NotNull String name, @Nullable Object value) { if (value == null) { data.remove(name); } else { data.set(name, value); } } @Nullable public PageDatum getPageDatum() { return pageDatum; } public void setPageDatum(PageDatum pageDatum) { this.pageDatum = pageDatum; } public boolean isCached() { return isCached; } public void setCached(boolean cached) { isCached = cached; } public boolean isLoaded() { return isLoaded; } public void setLoaded(boolean loaded) { isLoaded = loaded; } public boolean isFetched() { return isFetched; } public void setFetched(boolean fetched) { isFetched = fetched; } /** * Check if the page is canceled. *

* If a page is canceled, it should not be updated. * */ public boolean isCanceled() { return isCanceled; } /** * Check if the page is canceled. *

* If a page is canceled, it should not be updated. * */ public void setCanceled(boolean canceled) { isCanceled = canceled; } public boolean isContentUpdated() { return isContentUpdated; } @NotNull public VolatileConfig getConf() { return conf; } public void setConf(@NotNull VolatileConfig conf) { this.conf = conf; } public Metadata getMetadata() { return Metadata.box(page.getMetadata()); } /** * CrawlMarks are used for nutch style crawling. * */ public CrawlMarks getMarks() { return CrawlMarks.box(page.getMarkers()); } /** * Check if a mark is marked. *

* CrawlMarks are used for nutch style crawling. * */ public boolean hasMark(Mark mark) { return page.getMarkers().get(wrapKey(mark)) != null; } /** * The load arguments is variant task by task, so the local version is the first choice, * while the persisted version is used for historical check only *

* Underlying gora field should not use name 'args' which is already used, * see GProtocolStatus.args and GParseStatus.args */ @NotNull public String getArgs() { // Underlying gora field should not use name 'args' which is already used. CharSequence args = page.getParams(); return args != null ? args.toString() : ""; } /** * Set the arguments and clear the LoadOptions object. * */ public void setArgs(@NotNull String args) { variables.remove(VAR_LOAD_OPTIONS); page.setParams(args); } @NotNull public Duration getRetryDelay() { return retryDelay; } public void setRetryDelay(@NotNull Duration retryDelay) { this.retryDelay = retryDelay; } /** * Set a field loader, the loader takes a parameter as the field name, * and returns a GWebPage containing the field. * */ public void setLazyFieldLoader(Function lazyFieldLoader) { this.lazyFieldLoader = lazyFieldLoader; } public int getMaxRetries() { return getMetadata().getInt(Name.FETCH_MAX_RETRY, 3); } public void setMaxRetries(int maxRetries) { getMetadata().set(Name.FETCH_MAX_RETRY, maxRetries); } @NotNull public String getConfiguredUrl() { return UrlUtils.mergeUrlArgs(url, getArgs()); } public int getFetchedLinkCount() { return getMetadata().getInt(Name.FETCHED_LINK_COUNT, 0); } public void setFetchedLinkCount(int count) { getMetadata().set(Name.FETCHED_LINK_COUNT, count); } @NotNull public ZoneId getZoneId() { return page.getZoneId() == null ? DateTimes.INSTANCE.getZoneId() : ZoneId.of(page.getZoneId().toString()); } public void setZoneId(@NotNull ZoneId zoneId) { page.setZoneId(zoneId.getId()); } @Nullable public String getBatchId() { return page.getBatchId() == null ? null : page.getBatchId().toString(); } public void setBatchId(@Nullable String value) { // TODO: use Utf8 which is an optimized string // page.setBatchId(Utf8(value)); page.setBatchId(value); } /** * Mark this page as a seed where a crawl job starts from. 
* */ public void markSeed() { getMetadata().set(Name.IS_SEED, YES_STRING); } /** * Unmark this page to be a seed. * */ public void unmarkSeed() { getMetadata().remove(Name.IS_SEED); } /** * Check whether this page is a seed. * */ public boolean isSeed() { return getMetadata().contains(Name.IS_SEED); } /** * Get the distance of the page from the seed in the graph. * */ public int getDistance() { int distance = page.getDistance(); return distance < 0 ? DISTANCE_INFINITE : distance; } /** * Set the distance of the page from the seed in the graph. * */ public void setDistance(int newDistance) { page.setDistance(newDistance); } /** * Get the fetch mode, only BROWSER mode is supported currently. * Fetch mode is used to determine the protocol before fetch, so it shall be set before fetch. */ @NotNull public FetchMode getFetchMode() { return FetchMode.fromString(getMetadata().get(Name.FETCH_MODE)); } /** * Get the fetch mode, only BROWSER mode is supported currently. * Fetch mode is used to determine the protocol before fetch, so it shall be set before fetch */ public void setFetchMode(@NotNull FetchMode mode) { getMetadata().set(Name.FETCH_MODE, mode.name()); } /** * Get the browser used to fetch the page last time. */ @NotNull public BrowserType getLastBrowser() { String browser = page.getBrowser() != null ? page.getBrowser().toString() : ""; return BrowserType.fromString(browser); } /** * Set the browser used to fetch the page. */ public void setLastBrowser(@NotNull BrowserType browser) { page.setBrowser(browser.name()); } /** * Checks whether the page is a single resource which can be fetched by a single request. */ public boolean isResource() { return page.getResource() != null; } /** * Indicates the page to be a single resource that can be fetched by a single request. */ public void setResource(boolean resource) { if (resource) { page.setResource(1); } } @NotNull public HtmlIntegrity getHtmlIntegrity() { String integrity = page.getHtmlIntegrity() != null ? 
page.getHtmlIntegrity().toString() : ""; return HtmlIntegrity.Companion.fromString(integrity); } public void setHtmlIntegrity(@NotNull HtmlIntegrity integrity) { page.setHtmlIntegrity(integrity.name()); } public int getFetchPriority() { return page.getFetchPriority() > 0 ? page.getFetchPriority() : FETCH_PRIORITY_DEFAULT; } public void setFetchPriority(int priority) { page.setFetchPriority(priority); } public int sniffFetchPriority() { int priority = getFetchPriority(); int depth = getDistance(); if (depth < FETCH_PRIORITY_DEPTH_BASE) { priority = Math.max(priority, FETCH_PRIORITY_DEPTH_BASE - depth); } return priority; } @NotNull public Instant getCreateTime() { return Instant.ofEpochMilli(page.getCreateTime()); } public void setCreateTime(@NotNull Instant createTime) { page.setCreateTime(createTime.toEpochMilli()); } @NotNull public Instant getGenerateTime() { String generateTime = getMetadata().get(Name.GENERATE_TIME); if (generateTime == null) { return Instant.EPOCH; } else { return Instant.parse(generateTime); } } public void setGenerateTime(@NotNull Instant generateTime) { getMetadata().set(Name.GENERATE_TIME, generateTime.toString()); } public int getFetchCount() { return page.getFetchCount(); } public void setFetchCount(int count) { page.setFetchCount(count); } public void updateFetchCount() { int count = getFetchCount(); setFetchCount(count + 1); } @NotNull public CrawlStatus getCrawlStatus() { return new CrawlStatus(page.getCrawlStatus().byteValue()); } public void setCrawlStatus(@NotNull CrawlStatus crawlStatus) { page.setCrawlStatus(crawlStatus.getCode()); } public void setCrawlStatus(int value) { page.setCrawlStatus(value); } /** * The URL where the HTML was retrieved from, to resolve relative links against. *

* A baseUrl has the same semantic with Jsoup.parse: * * @return a {@link String} object. * @see Jsoup.parse * @see WebPage#getLocation */ public String getBaseUrl() { return page.getBaseUrl() == null ? "" : page.getBaseUrl().toString(); } /** * Get the URL this Document was parsed from. If the starting URL is a redirect, * this will return the final URL from which the document was served from. *

* WebPage.url is the permanent internal address, it might not still available to access the target. * And WebPage.location or WebPage.baseUrl is the last working address, it might redirect to url, * or it might have additional random parameters. * WebPage.location may be different from url which is generally normalized. *

* TODO: location is usually not the same as baseUrl, set it properly */ public String getLocation() { return getBaseUrl(); } /** * The url is the permanent internal address, it might not still available to access the target. *

* Location is the last working address, it might redirect to url, or it might have additional random parameters. *

* Location may be different from url, it's generally normalized. * * TODO: location is usually not the same as baseUrl, set it properly * * @param location The location. */ public void setLocation(@NotNull String location) { page.setBaseUrl(location); } /** * The latest fetch time * * @return The latest fetch time */ @NotNull public Instant getFetchTime() { return Instant.ofEpochMilli(page.getFetchTime()); } /** * The latest fetch time * * @param time The latest fetch time */ public void setFetchTime(@NotNull Instant time) { page.setFetchTime(time.toEpochMilli()); } /** * The previous fetch time, updated at the fetch stage * * @return The previous fetch time. */ @NotNull public Instant getPrevFetchTime() { return Instant.ofEpochMilli(page.getPrevFetchTime()); } public void setPrevFetchTime(@NotNull Instant time) { page.setPrevFetchTime(time.toEpochMilli()); } /** * The previous crawl time, used for fat link crawl, which means both the page itself and out pages are fetched * */ @NotNull public Instant getPrevCrawlTime1() { return Instant.ofEpochMilli(page.getPrevCrawlTime1()); } /** * The previous crawl time, used for fat link crawl, which means both the page itself and out pages are fetched * */ public void setPrevCrawlTime1(@NotNull Instant time) { page.setPrevCrawlTime1(time.toEpochMilli()); } /** * Get fetch interval * */ @NotNull public Duration getFetchInterval() { long seconds = page.getFetchInterval(); if (seconds < 0) { seconds = ChronoUnit.CENTURIES.getDuration().getSeconds(); } return Duration.ofSeconds(seconds); } /** * Set fetch interval * */ public void setFetchInterval(@NotNull Duration duration) { page.setFetchInterval((int) duration.getSeconds()); } /** * Set fetch interval in seconds * */ public void setFetchInterval(long seconds) { page.setFetchInterval((int) seconds); } /** * Set fetch interval in seconds * */ public void setFetchInterval(float seconds) { page.setFetchInterval(Math.round(seconds)); } /** * Get protocol status * */ @NotNull public 
ProtocolStatus getProtocolStatus() { GProtocolStatus protocolStatus = page.getProtocolStatus(); if (protocolStatus == null) { protocolStatus = GProtocolStatus.newBuilder().build(); } return ProtocolStatus.box(protocolStatus); } /** * Set protocol status * */ public void setProtocolStatus(@NotNull ProtocolStatus protocolStatus) { page.setProtocolStatus(protocolStatus.unbox()); } /** * Header information returned from the web server used to server the content which is subsequently fetched from. * This includes keys such as * TRANSFER_ENCODING, * CONTENT_ENCODING, * CONTENT_LANGUAGE, * CONTENT_LENGTH, * CONTENT_LOCATION, * CONTENT_DISPOSITION, * CONTENT_MD5, * CONTENT_TYPE, * LAST_MODIFIED * and LOCATION. */ @NotNull public ProtocolHeaders getHeaders() { return ProtocolHeaders.box(page.getHeaders()); } @NotNull public String getReprUrl() { return page.getReprUrl() == null ? "" : page.getReprUrl().toString(); } public void setReprUrl(@NotNull String value) { page.setReprUrl(value); } public int getFetchRetries() { return page.getFetchRetries(); } public void setFetchRetries(int value) { page.setFetchRetries(value); } @NotNull public Instant getModifiedTime() { return Instant.ofEpochMilli(page.getModifiedTime()); } public void setModifiedTime(@NotNull Instant value) { page.setModifiedTime(value.toEpochMilli()); } @NotNull public Instant getPrevModifiedTime() { return Instant.ofEpochMilli(page.getPrevModifiedTime()); } public void setPrevModifiedTime(@NotNull Instant value) { page.setPrevModifiedTime(value.toEpochMilli()); } @NotNull public String getFetchTimeHistory(@NotNull String defaultValue) { String s = getMetadata().get(Name.FETCH_TIME_HISTORY); return s == null ? 
defaultValue : s; } @NotNull public PageCategory getPageCategory() { try { CharSequence pageCategory = page.getPageCategory(); if (pageCategory != null) { return PageCategory.parse(pageCategory.toString()); } } catch (Throwable ignored) { } return PageCategory.UNKNOWN; } @NotNull public OpenPageCategory getOpenPageCategory() { try { CharSequence pageCategory = page.getPageCategory(); if (pageCategory != null) { return OpenPageCategory.Companion.parse(pageCategory.toString()); } } catch (Throwable ignored) { } return new OpenPageCategory("", ""); } /** * category : index, detail, review, media, search, etc * * @param pageCategory a {@link PageCategory} object. */ public void setPageCategory(@NotNull PageCategory pageCategory) { page.setPageCategory(pageCategory.format()); } public void setPageCategory(@NotNull OpenPageCategory pageCategory) { page.setPageCategory(pageCategory.format()); } /** * Get the encoding of the content. * Content encoding is detected just before it's parsed. *

* Not used if fetch mode is browser since the page content retrieved from a browser will always be UTF-8. */ @Nullable public String getEncoding() { return page.getEncoding() == null ? null : page.getEncoding().toString(); } /** * Set the encoding of the content. * Content encoding is detected just before it's parsed. *

* Not used if fetch mode is browser since the page content retrieved from a browser will always be UTF-8. */ public void setEncoding(@Nullable String encoding) { page.setEncoding(encoding); } /** * The entire raw document content e.g. raw XHTML * * @return The raw document content in {@link ByteBuffer}. */ @Nullable public ByteBuffer getContent() { if (tmpContent != null) { return tmpContent; } return getPersistContent(); } /** * Get the cached content */ @Nullable public ByteBuffer getTmpContent() { return tmpContent; } /** * Set the cached content, keep the persisted page content unmodified */ public void setTmpContent(ByteBuffer tmpContent) { this.tmpContent = tmpContent; } /** * Get the persisted page content */ @Nullable public ByteBuffer getPersistContent() { synchronized (CONTENT_MONITOR) { String fieldName = GWebPage.Field.CONTENT.getName(); // load content lazily if (page.getContent() == null && lazyFieldLoader != null && !lazyLoadedFields.contains(fieldName)) { lazyLoadedFields.add(fieldName); GWebPage lazyPage = lazyFieldLoader.apply(fieldName); page.setContent(lazyPage.getContent()); } return page.getContent(); } } /** * Get content as bytes, the underling buffer is duplicated * * @return a duplication of the underling buffer. 
*/
    @NotNull
    public byte[] getContentAsBytes() {
        ByteBuffer content = getContent();
        if (content == null) {
            return ByteUtils.toBytes('\0');
        }
        return ByteUtils.toBytes(content);
    }

    /**
     * Get the page content as a string, if the underlying page content is null or has no
     * remaining bytes, return an empty string.
     * The bytes between the buffer's position and limit are decoded as UTF-8,
     * the buffer's own position is not changed.
     */
    @NotNull
    public String getContentAsString() {
        ByteBuffer buffer = getContent();
        if (buffer == null || buffer.remaining() == 0) {
            return "";
        }
        // Decode a duplicate so that exactly the readable bytes (position..limit) are used
        // and the shared buffer's position is left untouched. The charset is explicit
        // so the result does not depend on the platform default.
        return java.nio.charset.StandardCharsets.UTF_8.decode(buffer.duplicate()).toString();
    }

    /**
     * Get the page content as input stream.
     * The stream reads the backing array of the content buffer, from its position to its limit.
     */
    @NotNull
    public ByteArrayInputStream getContentAsInputStream() {
        ByteBuffer contentInOctets = getContent();
        if (contentInOctets == null) {
            return new ByteArrayInputStream(ByteUtils.toBytes('\0'));
        }

        // Use the buffer that was null-checked: getContent() may return another buffer
        // (or null) on a second call since the cached content can be replaced at any time.
        return new ByteArrayInputStream(contentInOctets.array(),
                contentInOctets.arrayOffset() + contentInOctets.position(),
                contentInOctets.remaining());
    }

    /**
     * Get the page content as sax input source.
     * The encoding of the input source is set only if the page has an encoding.
     */
    @NotNull
    public InputSource getContentAsSaxInputSource() {
        InputSource inputSource = new InputSource(getContentAsInputStream());
        String encoding = getEncoding();
        if (encoding != null) {
            inputSource.setEncoding(encoding);
        }
        return inputSource;
    }

    /**
     * Set the page content, the string is encoded as UTF-8.
     *
     * @param value the new content, null clears the persisted content
     */
    public void setContent(@Nullable String value) {
        if (value != null) {
            // Explicit charset, consistent with getContentAsString().
            setContent(value.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        } else {
            setContent((ByteBuffer) null);
        }
    }

    /**
     * Set the page content.
     *
     * @param value the new content, null clears the persisted content
     */
    public void setContent(@Nullable byte[] value) {
        if (value != null) {
            setContent(ByteBuffer.wrap(value));
        } else {
            setContent((ByteBuffer) null);
        }
    }

    /**
     * Set the page content.
*
     * TODO: check consistency again
     *
     * @param value a ByteBuffer
     */
    public void setContent(@Nullable ByteBuffer value) {
        synchronized (CONTENT_MONITOR) {
            if (value == null) {
                clearPersistContent();
                return;
            }

            page.setContent(value);
            isContentUpdated = true;

            // Size of the buffer's backing array; the buffer's position and limit are not consulted.
            long persistedBytes = value.array().length;
            // save the length of the persisted content,
            // so we can query the length without loading the big or even huge content field
            setPersistedContentLength(persistedBytes);

            long originalBytes = getOriginalContentLength();
            if (originalBytes <= 0) {
                // TODO: it's for old version compatible
                originalBytes = persistedBytes;
            }
            computeContentLength(originalBytes);
        }
    }

    /**
     * Clear persist content, so the content will not write to the disk.
     * <p>
     * The current content is moved to {@code tmpContent} before the persisted field is cleared,
     * and the persisted content length is reset to zero.
     */
    public void clearPersistContent() {
        synchronized (CONTENT_MONITOR) {
            tmpContent = page.getContent();
            page.setContent(null);
            setPersistedContentLength(0);
            // TODO: check consistency
            // lazyLoadedFields.remove(GWebPage.Field.CONTENT.getName());
        }
    }

    /**
     * Get the length of content in bytes.
     *
     * @return The length of the content in bytes, or 0 when no length is stored.
     */
    public long getContentLength() {
        return page.getContentLength() != null ? page.getContentLength() : 0;
    }

    /**
     * Get the length of the original page content in bytes, the content has no pulsar metadata inserted.
     *
     * @return The length of the original page content in bytes, negative means not specified
     */
    public long getOriginalContentLength() {
        return getMetadata().getLong(Name.ORIGINAL_CONTENT_LENGTH, -1);
    }

    /**
     * Set the length of the original page content in bytes, the content has no pulsar metadata inserted.
     *
     * @param length The length of the original page content in bytes, negative means not specified
     */
    public void setOriginalContentLength(int length) {
        getMetadata().set(Name.ORIGINAL_CONTENT_LENGTH, String.valueOf(length));
    }

    /**
     * Compute the length of content in bytes.
     * <p>
     * The current content length becomes the last content length, {@code bytes} becomes the
     * current one, and the average content length is updated.
     *
     * @param bytes the new content length in bytes
     */
    private void computeContentLength(long bytes) {
        long lastBytes = getContentLength();
        page.setLastContentLength(lastBytes);
        page.setContentLength(bytes);
        computeAveContentLength(bytes);
    }

    /**
     * Update the running average of the content length with a new sample.
     *
     * @param bytes the content length of the latest fetch, in bytes
     */
    private void computeAveContentLength(long bytes) {
        int count = getFetchCount();
        // Use the null-safe getter: the stored average may be absent, and unboxing it directly would throw.
        long lastAveBytes = getAveContentLength();

        long aveBytes;
        if (count > 0 && lastAveBytes == 0) {
            // old version, average bytes is not calculated
            aveBytes = bytes;
        } else {
            aveBytes = (lastAveBytes * count + bytes) / (count + 1);
        }

        page.setAveContentLength(aveBytes);
    }

    /**
     * Get the length of the persisted content in bytes, or 0 when no length is stored.
     * <p>
     * TODO: if the page is loaded from cache, the persisted content length is zero, check this logic
     */
    public long getPersistedContentLength() {
        return page.getPersistedContentLength() != null ? page.getPersistedContentLength() : 0;
    }

    /**
     * Store the length of the persisted content.
     *
     * @param bytes the length in bytes
     */
    private void setPersistedContentLength(long bytes) {
        page.setPersistedContentLength(bytes);
    }

    /**
     * Get the content length recorded before the most recent update, or 0 when none is stored.
     */
    public long getLastContentLength() {
        return page.getLastContentLength() != null ? page.getLastContentLength() : 0;
    }

    /**
     * Get the average content length in bytes, or 0 when none is stored.
     */
    public long getAveContentLength() {
        return page.getAveContentLength() != null ? page.getAveContentLength() : 0;
    }

    /**
     * Get the content type, or an empty string when none is stored.
     */
    @NotNull
    public String getContentType() {
        return page.getContentType() == null ? "" : page.getContentType().toString();
    }

    /**
     * Set the content type; the value is trimmed and lower-cased before it is stored.
     *
     * @param value the content type, must not be null
     */
    public void setContentType(String value) {
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        page.setContentType(value.trim().toLowerCase(Locale.ROOT));
    }

    /**
     * Get the previous signature of the page, may be null.
     */
    @Nullable
    public ByteBuffer getPrevSignature() {
        return page.getPrevSignature();
    }

    /**
     * Set the previous signature of the page.
     */
    public void setPrevSignature(@Nullable ByteBuffer value) {
        page.setPrevSignature(value);
    }

    /**
     * Get the previous signature as a hex string; a missing signature is treated as an empty buffer.
     */
    @NotNull
    public String getPrevSignatureAsString() {
        ByteBuffer sig = getPrevSignature();
        if (sig == null) {
            sig = ByteBuffer.wrap(new byte[0]);
        }
        return Strings.toHexString(sig);
    }

    /**
     * The last proxy used to fetch the page
     *
     * @return the proxy, or null when none is stored
     */
    public String getProxy() {
        return page.getProxy() == null ? null : page.getProxy().toString();
    }

    /**
     * The last proxy used to fetch the page
     */
    public void setProxy(@Nullable String proxy) {
        page.setProxy(proxy);
    }

    /**
     * Get a copy of the active DOM status stored in the page.
     *
     * @return a new {@code ActiveDOMStatus}, or null when the page has no status record
     */
    @Nullable
    public ActiveDOMStatus getActiveDOMStatus() {
        GActiveDOMStatus s = page.getActiveDOMStatus();
        if (s == null) return null;

        return new ActiveDOMStatus(
                s.getN(),
                s.getScroll(),
                s.getSt().toString(),
                s.getR().toString(),
                s.getIdl().toString(),
                s.getEc().toString()
        );
    }

    /**
     * Copy the fields of the given status into the page's existing status record.
     * <p>
     * Nothing happens when {@code s} is null or when the page has no status record yet.
     * NOTE(review): in the latter case the given status is dropped silently — confirm this is intended.
     *
     * @param s the status to copy from
     */
    public void setActiveDOMStatus(@Nullable ActiveDOMStatus s) {
        if (s == null) {
            return;
        }

        GActiveDOMStatus s2 = page.getActiveDOMStatus();
        if (s2 != null) {
            s2.setN(s.getN());
            s2.setScroll(s.getScroll());
            s2.setSt(s.getSt());
            s2.setR(s.getR());
            s2.setIdl(s.getIdl());
            s2.setEc(s.getEc());
        }
    }

    /**
     * Get the active DOM stat trace as a new map: keys are converted to strings and
     * values are converted by {@code Converters.INSTANCE}.
     * <p>
     * NOTE(review): the generic type arguments of the maps are not visible in this copy of the
     * file — confirm them against the generated page API.
     */
    @NotNull
    public Map getActiveDOMStatTrace() {
        Map s = page.getActiveDOMStatTrace();
        return s.entrySet().stream().collect(Collectors.toMap(
                e -> e.getKey().toString(),
                e -> Converters.INSTANCE.convert(e.getValue())
        ));
    }

    /**
     * Set the active DOM stat trace; values are converted by {@code Converters.INSTANCE} and
     * the keys are kept as they are.
     *
     * @param trace the trace to store
     */
    public void setActiveDOMStatTrace(@NotNull Map trace) {
        Map statTrace = trace.entrySet().stream().collect(
                Collectors.toMap(Map.Entry::getKey, e -> Converters.INSTANCE.convert(e.getValue())));
        page.setActiveDOMStatTrace(statTrace);
    }

    /**
     * An implementation of a WebPage's signature from which it can be identified and referenced at any point in time.
     * This is essentially the WebPage's fingerprint representing its state for any point in time.
     */
    @Nullable
    public ByteBuffer getSignature() {
        return page.getSignature();
    }

    /**
     * Set the signature; the given array is wrapped, not copied.
     *
     * @param value the signature bytes, must not be null
     */
    public void setSignature(byte[] value) {
        page.setSignature(ByteBuffer.wrap(value));
    }

    /**
     * Get the signature as a hex string; a missing signature is treated as an empty buffer.
     */
    @NotNull
    public String getSignatureAsString() {
        ByteBuffer sig = getSignature();
        if (sig == null) {
            sig = ByteBuffer.wrap(new byte[0]);
        }
        return Strings.toHexString(sig);
    }

    /**
     * Get the page title, or an empty string when none is stored.
     */
    @NotNull
    public String getPageTitle() {
        return page.getPageTitle() == null ? "" : page.getPageTitle().toString();
    }

    /**
     * Set the page title.
     */
    public void setPageTitle(String pageTitle) {
        page.setPageTitle(pageTitle);
    }

    /**
     * Get the content title, or an empty string when none is stored.
     */
    @NotNull
    public String getContentTitle() {
        return page.getContentTitle() == null ? "" : page.getContentTitle().toString();
    }

    /**
     * Set the content title; a null value is ignored.
     */
    public void setContentTitle(String contentTitle) {
        if (contentTitle != null) {
            page.setContentTitle(contentTitle);
        }
    }

    /**
     * Get the page text, or an empty string when none is stored.
     */
    @NotNull
    public String getPageText() {
        return page.getPageText() == null ? "" : page.getPageText().toString();
    }

    /**
     * Set the page text; null and empty values are ignored.
     */
    public void setPageText(String value) {
        if (value != null && !value.isEmpty()) page.setPageText(value);
    }

    /**
     * Get the content text, or an empty string when none is stored.
     */
    @NotNull
    public String getContentText() {
        return page.getContentText() == null ? "" : page.getContentText().toString();
    }

    /**
     * Set the content text together with its length in chars; null and empty values are ignored.
     */
    public void setContentText(String textContent) {
        if (textContent != null && !textContent.isEmpty()) {
            page.setContentText(textContent);
            page.setContentTextLen(textContent.length());
        }
    }

    /**
     * Get the stored length of the content text.
     */
    public int getContentTextLen() {
        return page.getContentTextLen();
    }

    /**
     * Get the parse status; a freshly built default status is boxed when the page has none.
     */
    @NotNull
    public ParseStatus getParseStatus() {
        GParseStatus parseStatus = page.getParseStatus();
        return ParseStatus.box(parseStatus == null ? GParseStatus.newBuilder().build() : parseStatus);
    }

    /**
     * Set the parse status.
     *
     * @param parseStatus the status, must not be null
     */
    public void setParseStatus(ParseStatus parseStatus) {
        page.setParseStatus(parseStatus.unbox());
    }

    /**
     * Get the live links map held by the page; the map itself is returned, not a copy.
     */
    public Map getLiveLinks() {
        return page.getLiveLinks();
    }

    /**
     * Get the keys of the live links map, converted to strings.
     */
    public Collection getSimpleLiveLinks() {
        return CollectionUtils.collect(page.getLiveLinks().keySet(), CharSequence::toString);
    }

    /**
     * Replace the content of the live links map with the given links, keyed by their urls.
     */
    public void setLiveLinks(Iterable liveLinks) {
        Map links = page.getLiveLinks();
        links.clear();
        liveLinks.forEach(l -> links.put(l.getUrl(), l.unbox()));
    }

    /**
     * Replace the live links map.
     */
    public void setLiveLinks(Map links) {
        page.setLiveLinks(links);
    }

    /**
     * Add a single live link, keyed by its url.
     */
    public void addLiveLink(HyperlinkPersistable hyperLink) {
        page.getLiveLinks().put(hyperLink.getUrl(), hyperLink.unbox());
    }

    /**
     * Get the vivid links map held by the page; the map itself is returned, not a copy.
     */
    public Map getVividLinks() {
        return page.getVividLinks();
    }

    /**
     * Get the keys of the vivid links map, converted to strings.
     */
    public Collection getSimpleVividLinks() {
        return CollectionUtils.collect(page.getVividLinks().keySet(), CharSequence::toString);
    }

    /**
     * Replace the vivid links map.
     */
    public void setVividLinks(Map links) {
        page.setVividLinks(links);
    }

    /**
     * Get the dead links list held by the page.
     */
    public List getDeadLinks() {
        return page.getDeadLinks();
    }

    /**
     * Replace the dead links list.
     */
    public void setDeadLinks(List deadLinks) {
        page.setDeadLinks(deadLinks);
    }

    /**
     * Get the links list held by the page.
     */
    public List getLinks() {
        return page.getLinks();
    }

    /**
     * Replace the links list.
     */
    public void setLinks(List links) {
        page.setLinks(links);
    }

    /**
     * Get the link count stored in the metadata under {@code Name.TOTAL_OUT_LINKS}, 0 when absent.
     */
    public int getImpreciseLinkCount() {
        return getMetadata().getInt(Name.TOTAL_OUT_LINKS, 0);
    }

    /**
     * Store the link count in the metadata under {@code Name.TOTAL_OUT_LINKS}.
     */
    public void setImpreciseLinkCount(int count) {
        getMetadata().set(Name.TOTAL_OUT_LINKS, String.valueOf(count));
    }

    /**
     * Get the inlinks map held by the page.
     */
    public Map getInlinks() {
        return page.getInlinks();
    }

    /**
     * Get the anchor, or an empty string when none is stored.
     */
    @NotNull
    public CharSequence getAnchor() {
        return page.getAnchor() != null ? page.getAnchor() : "";
    }

    /**
     * Set the anchor.
     */
    public void setAnchor(CharSequence anchor) {
        page.setAnchor(anchor);
    }

    /**
     * Get the inlink anchors, read from the newline-separated metadata value.
     * <p>
     * NOTE(review): the anchors are stored under {@code Name.ANCHOR_COUNT} — confirm the key is intended.
     */
    public String[] getInlinkAnchors() {
        return StringUtils.split(getMetadata().getOrDefault(Name.ANCHOR_COUNT, ""), "\n");
    }

    /**
     * Store the inlink anchors in the metadata, joined by newlines.
     */
    public void setInlinkAnchors(Collection anchors) {
        getMetadata().set(Name.ANCHOR_COUNT, StringUtils.join(anchors, "\n"));
    }

    /**
     * Get the anchor order; a negative stored value is reported as {@code MAX_LIVE_LINK_PER_PAGE}.
     */
    public int getAnchorOrder() {
        return page.getAnchorOrder() < 0 ? MAX_LIVE_LINK_PER_PAGE : page.getAnchorOrder();
    }

    /**
     * Set the anchor order.
     */
    public void setAnchorOrder(int order) {
        page.setAnchorOrder(order);
    }

    /**
     * Get the content publish time, built from the stored epoch milliseconds.
     */
    public Instant getContentPublishTime() {
        return Instant.ofEpochMilli(page.getContentPublishTime());
    }

    /**
     * Set the content publish time; it is stored as epoch milliseconds.
     */
    public void setContentPublishTime(Instant publishTime) {
        page.setContentPublishTime(publishTime.toEpochMilli());
    }

    /**
     * Tell whether the given time is after {@code MIN_ARTICLE_PUBLISH_TIME}.
     */
    public boolean isValidContentModifyTime(Instant publishTime) {
        return publishTime.isAfter(MIN_ARTICLE_PUBLISH_TIME);
    }

    /**
     * Get the previous content publish time, built from the stored epoch milliseconds.
     */
    public Instant getPrevContentPublishTime() {
        return Instant.ofEpochMilli(page.getPrevContentPublishTime());
    }

    /**
     * Set the previous content publish time; it is stored as epoch milliseconds.
     */
    public void setPrevContentPublishTime(Instant publishTime) {
        page.setPrevContentPublishTime(publishTime.toEpochMilli());
    }

    /**
     * Get the ref content publish time, built from the stored epoch milliseconds.
     */
    public Instant getRefContentPublishTime() {
        return Instant.ofEpochMilli(page.getRefContentPublishTime());
    }

    /**
     * Set the ref content publish time; it is stored as epoch milliseconds.
     */
    public void setRefContentPublishTime(Instant publishTime) {
        page.setRefContentPublishTime(publishTime.toEpochMilli());
    }

    /**
     * Get the content modified time, built from the stored epoch milliseconds.
     */
    public Instant getContentModifiedTime() {
        return Instant.ofEpochMilli(page.getContentModifiedTime());
    }

    /**
     * Set the content modified time; it is stored as epoch milliseconds.
     */
    public void setContentModifiedTime(Instant modifiedTime) {
        page.setContentModifiedTime(modifiedTime.toEpochMilli());
    }

    /**
     * Get the previous content modified time, built from the stored epoch milliseconds.
     */
    public Instant getPrevContentModifiedTime() {
        return Instant.ofEpochMilli(page.getPrevContentModifiedTime());
    }

    /**
     * Set the previous content modified time; it is stored as epoch milliseconds.
     */
    public void setPrevContentModifiedTime(Instant modifiedTime) {
        page.setPrevContentModifiedTime(modifiedTime.toEpochMilli());
    }

    /**
     * Get the previous ref content publish time, built from the stored epoch milliseconds.
     */
    public Instant getPrevRefContentPublishTime() {
        return Instant.ofEpochMilli(page.getPrevRefContentPublishTime());
    }

    /**
     * Set the previous ref content publish time; it is stored as epoch milliseconds.
     */
    public void setPrevRefContentPublishTime(Instant publishTime) {
        page.setPrevRefContentPublishTime(publishTime.toEpochMilli());
    }

    /**
     * Get the referrer, or null when none is stored.
     */
    @Nullable
    public String getReferrer() {
        return page.getReferrer() == null ? null : page.getReferrer().toString();
    }

    /**
     * Set the referrer; the value is stored only when {@code UrlUtils.isStandard} accepts it,
     * any other value is ignored.
     */
    public void setReferrer(@Nullable String referrer) {
        if (UrlUtils.isStandard(referrer)) {
            page.setReferrer(referrer);
        }
    }

    /*
     * *****************************************************************************
     * Page Model
     * ******************************************************************************
     */

    /**
     * Get the page model update time, built from the stored epoch milliseconds.
     * <p>
     * Although annotated as nullable, this implementation always returns an instant.
     */
    @Nullable
    public Instant getPageModelUpdateTime() {
        return Instant.ofEpochMilli(page.getPageModelUpdateTime());
    }

    /**
     * Set the page model update time; null is stored as epoch millisecond 0.
     */
    public void setPageModelUpdateTime(@Nullable Instant time) {
        page.setPageModelUpdateTime(time == null ? 0 : time.toEpochMilli());
    }

    /**
     * Get the page model, loading it through the lazy field loader on first access if it is
     * not present and a loader is available.
     *
     * @return the boxed page model, or null when the page has none
     */
    @Nullable
    public PageModel getPageModel() {
        synchronized (PAGE_MODEL_MONITOR) {
            String fieldName = GWebPage.Field.PAGE_MODEL.getName();
            // load content lazily
            if (page.getPageModel() == null && lazyFieldLoader != null && !lazyLoadedFields.contains(fieldName)) {
                // The field is marked as loaded first, so the loader runs at most once for it.
                lazyLoadedFields.add(fieldName);
                GWebPage lazyPage = lazyFieldLoader.apply(fieldName);
                // Defensive: tolerate a loader that has nothing to return.
                if (lazyPage != null) {
                    page.setPageModel(lazyPage.getPageModel());
                }
            }
            return page.getPageModel() == null ? null : PageModel.box(page.getPageModel());
        }
    }

    /**
     * Get the page model, creating an empty one first when the page has none.
     */
    @NotNull
    public PageModel ensurePageModel() {
        synchronized (PAGE_MODEL_MONITOR) {
            if (page.getPageModel() == null) {
                page.setPageModel(GPageModel.newBuilder().build());
            }
            return Objects.requireNonNull(getPageModel());
        }
    }

    /*
     * *****************************************************************************
     * Scoring
     * ******************************************************************************
     */

    /**
     * Get the score of the page.
     */
    public float getScore() {
        return page.getScore();
    }

    /**
     * Set the score of the page.
     */
    public void setScore(float value) {
        page.setScore(value);
    }

    /**
     * Get the content score, or 0 when none is stored.
     */
    public float getContentScore() {
        return page.getContentScore() == null ? 0.0f : page.getContentScore();
    }

    /**
     * Set the content score.
     */
    public void setContentScore(float score) {
        page.setContentScore(score);
    }

    /**
     * Get the sort score, or an empty string when none is stored.
     */
    @NotNull
    public String getSortScore() {
        return page.getSortScore() == null ? "" : page.getSortScore().toString();
    }

    /**
     * Set the sort score.
     */
    public void setSortScore(String score) {
        page.setSortScore(score);
    }

    /**
     * Get the cash stored in the metadata under {@code Name.CASH_KEY}, 0 when absent.
     */
    public float getCash() {
        return getMetadata().getFloat(Name.CASH_KEY, 0.0f);
    }

    /**
     * Store the cash in the metadata under {@code Name.CASH_KEY}.
     */
    public void setCash(float cash) {
        getMetadata().set(Name.CASH_KEY, String.valueOf(cash));
    }

    /**
     * Get the page counters, boxed from the underlying record.
     */
    @NotNull
    public PageCounters getPageCounters() {
        return PageCounters.box(page.getPageCounters());
    }

    /*
     * *****************************************************************************
     * Index
     * ******************************************************************************
     */

    /**
     * The hash code is the hash code of the url, in line with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        return url.hashCode();
    }

    /**
     * Pages are ordered by their urls.
     * <p>
     * NOTE(review): the class declaration visible in this copy implements a raw {@code Comparable};
     * this override needs {@code WebPage} as the type argument — confirm it was lost in extraction.
     */
    @Override
    public int compareTo(@NotNull WebPage o) {
        return url.compareTo(o.url);
    }

    /**
     * Two pages are equal when their urls are equal.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        return other instanceof WebPage && ((WebPage) other).url.equals(url);
    }

    /**
     * Format the page with a {@code WebPageFormatter}.
     */
    @Override
    public String toString() {
        return new WebPageFormatter(this).format();
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy