All downloads are free. The search and download features use the official Maven repository.

ai.platon.pulsar.ql.common.io.ValueDomWritable Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version
package ai.platon.pulsar.ql.common.io;

import ai.platon.pulsar.common.concurrent.ConcurrentLRUCache;
import ai.platon.pulsar.dom.Documents;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.ql.common.types.ValueDom;
import org.apache.hadoop.io.Writable;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Hadoop {@link Writable} that carries a {@link ValueDom} across a data stream.
 *
 * <p>Wire format produced by {@link #write(DataOutput)} and consumed by
 * {@link #readFields(DataInput)}:
 * <ol>
 *   <li>the owner document's base URI, terminated by {@code '\n'};</li>
 *   <li>the element's CSS selector, terminated by {@code '\n'};</li>
 *   <li>a 4-byte int holding the payload length in bytes;</li>
 *   <li>the payload, UTF-8 encoded: either the document's outer HTML, or the
 *       marker {@code "(cached)"} when the writer has already sent the HTML
 *       for that base URI and it is still in the writer's cache.</li>
 * </ol>
 *
 * <p>The base URI and the selector are written with
 * {@link DataOutput#writeBytes(String)}, which keeps only the low byte of each
 * char, so non-ASCII characters in either of them are not preserved.
 */
public class ValueDomWritable implements Writable {

    /**
     * Capacity handed to the writer-side page cache. The value is read once,
     * when this class is initialized; assigning it afterwards does not resize
     * the cache. Left non-final so existing code that assigns it still compiles.
     */
    public static int CACHE_SIZE = 200;
    private static final FeaturedDocument NIL_DOC = FeaturedDocument.Companion.getNIL();
    private static final Duration CACHE_EXPIRES = Duration.ofMinutes(10);
    private static final String CACHED_HINT = "(cached)";
    private static final String ROOT_SELECTOR = "#root";

    // server side: base URI -> HTML already sent to the reader.
    // TODO: check if this is client side or server side, and ensure items on the client side
    //  live longer than those on the server side
    private static final ConcurrentLRUCache<String, String> pageCache =
            new ConcurrentLRUCache<>(CACHE_EXPIRES, CACHE_SIZE);

    // client side: base URI -> parsed document.
    // NOTE(review): this map has no size limit or expiry — confirm that is intended.
    private static final Map<String, FeaturedDocument> documentCache = new ConcurrentHashMap<>();

    private ValueDom dom;

    /** Creates an empty instance to be populated by {@link #readFields(DataInput)}. */
    public ValueDomWritable() {}

    /**
     * Creates an instance wrapping the given value.
     *
     * @param dom the value to serialize
     */
    public ValueDomWritable(ValueDom dom) {
        this.dom = dom;
    }

    /**
     * Returns the wrapped value.
     *
     * @return the value passed to the constructor or restored by
     *         {@link #readFields(DataInput)}; {@code null} if neither happened
     */
    public ValueDom get() {
        return dom;
    }

    /**
     * Writes the wrapped element's base URI, CSS selector and document HTML.
     *
     * <p>If the page cache already holds non-empty HTML for the base URI, the
     * marker {@code "(cached)"} is sent instead of the HTML; otherwise the
     * document's outer HTML is sent and stored in the page cache.
     *
     * @param out the stream to write to
     * @throws IllegalStateException if no value has been set
     * @throws IOException if the element has no owner document, or the stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        if (dom == null) {
            throw new IllegalStateException("No ValueDom to write");
        }

        Element ele = dom.getElement();
        Document doc = ele.ownerDocument();
        if (doc == null) {
            throw new IOException("Cannot serialize an element that has no owner document");
        }
        String baseUri = doc.baseUri();

        out.writeBytes(baseUri);
        out.write('\n'); // make a line

        out.writeBytes(ele.cssSelector());
        out.write('\n'); // make a line

        String html;
        String cachedHtml = pageCache.get(baseUri);
        if (cachedHtml != null && !cachedHtml.isEmpty()) {
            // tell the client it's cached
            html = CACHED_HINT;
        } else {
            // not cached, cache it
            html = doc.outerHtml();
            pageCache.put(baseUri, html);
        }

        // The length prefix must count encoded bytes, not chars, or the reader
        // loses its place in the stream as soon as the HTML contains non-ASCII text.
        byte[] payload = html.getBytes(StandardCharsets.UTF_8);
        out.writeInt(payload.length);
        out.write(payload);
    }

    /**
     * Restores the wrapped value from the format produced by {@link #write(DataOutput)}.
     *
     * <p>When the payload is the {@code "(cached)"} marker, the document is looked
     * up in the local document cache by base URI; otherwise the payload is parsed
     * and stored there. If no document is available, or the selector matches
     * nothing, the NIL document (respectively its body) is used.
     *
     * <p>TODO: The local cache might not be in sync with the server side, which leads
     * to no data on the client side; we need a better solution.
     *
     * @param in the stream to read from
     * @throws EOFException if the stream ends before the base URI or selector line
     * @throws IOException if the length prefix is negative, or the stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        String baseUri = in.readLine();
        String selector = in.readLine();
        if (baseUri == null || selector == null) {
            throw new EOFException("Stream ended before the base URI and selector were read");
        }

        int htmlLen = in.readInt();
        if (htmlLen < 0) {
            throw new IOException("Negative HTML length: " + htmlLen);
        }

        // Always consume exactly htmlLen bytes; readFully either reads them all or throws.
        byte[] bytes = new byte[htmlLen];
        in.readFully(bytes);
        String html = new String(bytes, StandardCharsets.UTF_8);

        FeaturedDocument doc;
        // Compare the content, not just the length, so a payload that merely has
        // the marker's length is still parsed as HTML.
        if (CACHED_HINT.equals(html)) {
            // cached
            doc = documentCache.get(baseUri);
        } else {
            // not cached
            doc = Documents.INSTANCE.parse(html, baseUri);
            documentCache.put(baseUri, doc);
        }

        if (doc == null) {
            doc = NIL_DOC;
        }

        Element ele;
        if (ROOT_SELECTOR.equals(selector)) {
            ele = doc.unbox();
        } else {
            ele = doc.selectFirst(selector);
        }

        if (ele == null) {
            ele = NIL_DOC.getBody();
        }

        dom = ValueDom.get(ele);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy