All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.boilerpipe.document.TextDocument Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version
/**
 * Created by vincent on 16-11-9.
 * Copyright @ 2013-2016 Platon AI. All rights reserved
 */
package ai.platon.pulsar.boilerpipe.document;

import ai.platon.pulsar.boilerpipe.utils.PageCategory;

import java.time.Instant;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class TextDocument implements Cloneable {

  /** Scent Stat */
  public class Stat {
    public int _char = 0;
    public int _a = 0;
    public int _img = 0;
  }

  private final List textBlocks;
  private String baseUrl;
  private String pageTitle;
  private String contentTitle;
  private Instant publishTime = Instant.EPOCH;
  private Instant modifiedTime = Instant.EPOCH;
  private int dateTimeCount = 0;
  private Stat stat = new Stat();
  private PageCategory pageCategory = PageCategory.UNKNOWN;
  private Map fields = new LinkedHashMap<>();

  public TextDocument(String baseUrl, String pageTitle, List textBlocks) {
    this.baseUrl = baseUrl;
    this.pageTitle = pageTitle;
    this.textBlocks = textBlocks;
  }

  public List getTextBlocks() { return textBlocks; }

  public void setBaseUrl(String baseUrl) { this.baseUrl = baseUrl; }

  public String getBaseUrl() { return baseUrl; }

  public String getPageTitle() { return pageTitle == null ? "" : pageTitle; }

  public void setPageTitle(String pageTitle) { this.pageTitle = pageTitle; }

  public String getContentTitle() { return contentTitle == null ? "" : contentTitle; }

  public void setContentTitle(String contentTitle) { this.contentTitle = contentTitle; }

  public Instant getPublishTime() { return publishTime; }

  public void setPublishTime(Instant publishTime) { this.publishTime = publishTime; }

  public Instant getModifiedTime() { return modifiedTime; }

  public void setModifiedTime(Instant modifiedTime) { this.modifiedTime = modifiedTime; }

  public int getDateTimeCount() { return dateTimeCount; }

  public void setDateTimeCount(int dateTimeCount) { this.dateTimeCount = dateTimeCount; }

  public void setStat(Stat stat) { this.stat = stat; }

  public Stat getStat() { return this.stat; }

  public PageCategory getPageCategory() {
    return pageCategory;
  }

  public String getPageCategoryAsString() {
    return pageCategory.toString();
  }

  public void setPageCategory(PageCategory pageCategory) {
    this.pageCategory = pageCategory;
  }

  public Map getFields() {
    return fields;
  }

  public String getField(String name) {
    return fields.get(name);
  }

  public String getFieldOrDefault(String name, String defaultValue) {
    return fields.getOrDefault(name, defaultValue);
  }

  public void setField(String name, String value) {
    fields.put(name, value);
  }

  public String getTextContent() {
    return getTextContent(true, false);
  }

  public String getTextContent(boolean includeContent, boolean includeNonContent) {
    StringBuilder sb = new StringBuilder();
    for (TextBlock block : getTextBlocks()) {
      if (block.isContent()) {
        if (!includeContent) {
          continue;
        }
      } else {
        if (!includeNonContent) {
          continue;
        }
      }
      sb.append(block.getText());
      sb.append('\n');
    }
    return sb.toString();
  }

  public String getHtmlContent() {
    return getHtmlContent(true, false);
  }

  public String getHtmlContent(boolean includeContent, boolean includeNonContent) {
    StringBuilder sb = new StringBuilder();
    sb.append("
"); LOOP: for (TextBlock block : getTextBlocks()) { if (block.isContent()) { if (!includeContent) { continue LOOP; } } else { if (!includeNonContent) { continue LOOP; } } if (block.getRichText().length() > 1) { sb.append("
"); sb.append(block.getRichText()); sb.append("
\n"); } } sb.append("
"); return sb.toString(); } public String debugString() { StringBuilder sb = new StringBuilder(); for (TextBlock tb : getTextBlocks()) { if (!tb.getLabels().isEmpty()) { sb.append("[LABLE : " + tb.getLabels() + "]"); } sb.append(tb.toString()); sb.append('\n'); } return sb.toString(); } @Override public TextDocument clone() { final List list = new ArrayList<>(textBlocks.size()); for (TextBlock tb : textBlocks) { list.add(tb.clone()); } return new TextDocument(baseUrl, pageTitle, list); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy