com.plato.util.html.HTMLSearchableCompression Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of HTMLSearchableCompression Show documentation
Show all versions of HTMLSearchableCompression Show documentation
A utility library for compressing HTML while being able to search text.
The newest version!
package com.plato.util.html;
import org.mantoux.util.datastructures.InsertStringBuilder;
import java.util.Deque;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.plato.util.html.TagInstance.TAG_DELIMIT;
/**
 * Splits an HTML string into a plain text (markup removed) and a description of the removed
 * tags, and rebuilds the HTML from those two parts.
 *
 * <p>Regular (opening/closing) tags are kept in the {@code tags} stack, self-closing tags in the
 * {@code selfClosings} stack. Both are used as stacks through {@link Deque#push}/{@link
 * Deque#pop}, so their head is always the most recently recorded tag.
 */
public class HTMLSearchableCompression {

  static final String ESCAPE = "#";
  private static final String TAGS_DELIMIT = ESCAPE + "tags" + ESCAPE;

  /** Regular tags, pushed when their closing tag is met (head = last closed tag). */
  private final Deque<TagInstance> tags;
  /** Self-closing tags, pushed in order of appearance (head = last one met). */
  private final Deque<TagInstance> selfClosings;
  /** Text left once every matched tag has been removed; set by {@link #encode(String)}. */
  private String plainText;

  /** Creates an empty instance with no tags and no plain text. */
  public HTMLSearchableCompression() {
    super();
    this.tags = new LinkedList<>();
    this.selfClosings = new LinkedList<>();
  }

  /**
   * @return a copy of the regular tags, in the same head-to-tail order as the internal stack;
   *     modifying the returned deque does not affect this instance
   */
  public Deque<TagInstance> getTags() {
    return new LinkedList<>(tags);
  }

  /**
   * @return a copy of the self-closing tags, in the same head-to-tail order as the internal
   *     stack; modifying the returned deque does not affect this instance
   */
  public Deque<TagInstance> getSelfClosings() {
    return new LinkedList<>(selfClosings);
  }

  /** @return the plain text produced by the last {@link #encode(String)}, or null if none */
  public String getPlainText() {
    return plainText;
  }

  /**
   * Rebuilds the HTML from a plain text and its tags.
   *
   * <p>The given deques are copied before being consumed, so the caller's collections are left
   * untouched.
   *
   * @param plain the plain text (without any tag)
   * @param inTags the regular tags, head = tag closed last
   * @param sClosings the self-closing tags, head = tag appearing last
   * @return the reconstructed HTML
   */
  public static String decode(String plain,
                              Deque<TagInstance> inTags,
                              Deque<TagInstance> sClosings) {
    HTMLSearchableCompression c = new HTMLSearchableCompression();
    // Work on private copies: the algorithm below pops both stacks until they are empty.
    c.tags.addAll(inTags);
    c.selfClosings.addAll(sClosings);
    c.plainText = plain;

    /* Insert self closing tags in plain text */
    c.processSelfClosingTags(c.selfClosings, new InsertStringBuilder(), plain.length());

    /* Rebase input with included self closing tags */
    int index = c.plainText.length();
    InsertStringBuilder html = new InsertStringBuilder();
    // Tags whose closing markup has been written but whose opening markup has not yet.
    Deque<TagInstance> ts = new LinkedList<>();

    /* Process tags from the stack, walking the text from its end to its start */
    while (c.tags.peek() != null) {
      TagInstance pending = ts.peek();
      /* if next tag is closing before the opening of the last processed one */
      if (pending != null && c.tags.peek().to() <= pending.from()) {
        index = c.processOpeningTags(html, ts, index);
      } else {
        index = c.processClosingTags(html, ts, index);
      }
    }
    // Every closing tag is written; flush the opening tags that are still pending.
    while (ts.peek() != null) {
      index = c.processOpeningTags(html, ts, index);
    }
    return html.insertFirst(c.plainText.substring(0, index)).toString();
  }

  /**
   * Parses the output of {@link #serializeTagsString()}.
   *
   * @param in the serialized tags
   * @return an instance holding the parsed tags; its plain text is not set
   */
  public static HTMLSearchableCompression deserializeString(String in) {
    HTMLSearchableCompression c = new HTMLSearchableCompression();
    // Layout: TAGS_DELIMIT regular-tags TAGS_DELIMIT self-closing-tags, hence tmp[0] is the
    // (empty) text before the first delimiter.
    String[] tmp = in.split(TAGS_DELIMIT);
    // if no tag
    if (tmp.length < 2) {
      return c;
    }
    for (String sTag : tmp[1].split(TAG_DELIMIT)) {
      if (!sTag.trim().isEmpty()) {
        c.tags.add(TagInstance.deserializeString(sTag, false));
      }
    }
    // if no self closings
    if (tmp.length < 3) {
      return c;
    }
    for (String sTag : tmp[2].split(TAG_DELIMIT)) {
      if (!sTag.trim().isEmpty()) {
        c.selfClosings.add(TagInstance.deserializeString(sTag, true));
      }
    }
    return c;
  }

  /**
   * Rebuilds the HTML from a plain text and serialized tags.
   *
   * @param plain the plain text (without any tag)
   * @param tags the tags as produced by {@link #serializeTagsString()}
   * @return the reconstructed HTML
   */
  public static String decode(String plain, String tags) {
    HTMLSearchableCompression c = deserializeString(tags);
    // decode copies the deques itself, so the internal ones can be handed over directly.
    return decode(plain, c.tags, c.selfClosings);
  }

  /**
   * Extracts the tags of {@code in} and stores the remaining text as the plain text.
   *
   * <p>Non-selfclosing tags are stored in a stack structure "tags".
   * Self closing tags are stored in a stack structure "selfClosings".
   * Style attributes are stored within the tag instance.
   *
   * <p>Any tags recorded by a previous call are discarded first, so the stored tags always
   * describe the current plain text.
   *
   * @param in the HTML to encode
   * @throws IllegalArgumentException if a closing tag does not match the last opened tag
   */
  public void encode(String in) {
    // Start from a clean state: offsets of a previous input are meaningless for this one.
    tags.clear();
    selfClosings.clear();

    Matcher m = Pattern.compile(Tag.getRegex()).matcher(in);
    /* Helper variables */
    int nextToParseIndex = 0;
    StringBuilder sbPlainText = new StringBuilder();
    // Structure to store opened tag not yet closed
    Deque<TagInstance> tempStack = new LinkedList<>();
    // cumulated length of the regular (opening and closing) tags met so far
    int offset = 0;
    // cumulated length of the self-closing tags met so far
    int closingOffset = 0;

    /* encoding algorithm */
    while (m.find()) {
      String sTag = m.group();
      TagInstance tInstance;
      if (sTag.charAt(1) != '/') {
        tInstance = getTagOpening(m, tempStack, offset, closingOffset, sTag);
      } else {
        tInstance = getTagClosing(m, tempStack, offset, sTag);
      }
      // Keep the text located between the previous tag and this one.
      sbPlainText.append(in.substring(nextToParseIndex, m.start()));
      nextToParseIndex = m.end();
      if (!tInstance.tagName().isSelfClosing) {
        offset += sTag.length();
      } else {
        closingOffset += sTag.length();
      }
    }
    // NOTE(review): tags still in tempStack here were opened but never closed; they are
    // removed from the plain text without being recorded — confirm this is intended.
    plainText = sbPlainText.append(in.substring(nextToParseIndex)).toString();
  }

  /**
   * Serializes the tags as: TAGS_DELIMIT, the regular tags, TAGS_DELIMIT, the self-closing
   * tags; every tag is preceded by TAG_DELIMIT.
   *
   * <p>Assumption: there are always regular tags, self-closing tags are optional.
   *
   * @return the String format of the compression
   */
  public String serializeTagsString() {
    StringBuilder s = new StringBuilder();
    // Serialize regular tags
    s.append(TAGS_DELIMIT);
    for (TagInstance t : tags) {
      s.append(TAG_DELIMIT).append(t.serializeString());
    }
    // Serialize self-closing tags
    s.append(TAGS_DELIMIT);
    for (TagInstance t : selfClosings) {
      s.append(TAG_DELIMIT).append(t.serializeString());
    }
    return s.toString();
  }

  /**
   * Records an opening or self-closing tag.
   *
   * <p>A self-closing tag is pushed on {@code selfClosings} with a position expressed in the
   * plain text (all tags removed); any other tag is pushed on {@code tempStack} with a position
   * that only discounts regular tags.
   *
   * @param m matcher positioned on the tag
   * @param tempStack opened tags not yet closed
   * @param offset cumulated length of the regular tags already met
   * @param closingOffset cumulated length of the self-closing tags already met
   * @param sTag the tag markup
   * @return the created tag instance
   */
  private TagInstance getTagOpening(Matcher m,
                                    Deque<TagInstance> tempStack,
                                    int offset,
                                    int closingOffset,
                                    String sTag) {
    Tag tName = Tag.isTag(sTag);
    boolean selfClosing = tName != null && tName.isSelfClosing;
    int position = selfClosing ? m.start() - closingOffset - offset : m.start() - offset;

    TagInstance tInstance = new TagInstance(sTag, position);
    tInstance.findAttributes(sTag);
    if (selfClosing) {
      selfClosings.push(tInstance);
    } else {
      tempStack.push(tInstance);
    }
    return tInstance;
  }

  /**
   * Closes the most recently opened tag and moves it to {@code tags}.
   *
   * @param m matcher positioned on the closing tag
   * @param tempStack opened tags not yet closed
   * @param offset cumulated length of the regular tags already met
   * @param sTag the closing tag markup
   * @return the tag instance that has just been closed
   * @throws IllegalArgumentException if no tag is open or if the open tag has another name
   */
  private TagInstance getTagClosing(Matcher m,
                                    Deque<TagInstance> tempStack,
                                    int offset,
                                    String sTag) {
    // poll() returns null on an empty stack (pop() would throw NoSuchElementException), which
    // lets a closing tag without any opening tag be reported as a parser error below.
    TagInstance tInstance = tempStack.poll();
    String sTagName = Tag.prepareString(sTag);
    if (tInstance == null || !tInstance.tagName().toString().equals(sTagName)) {
      throw new IllegalArgumentException(
          "Parser error - closing tag doesn't match current opening tag\n" + sTag);
    }
    tInstance.setRangeTo(m.start() - offset);
    tags.push(tInstance);
    return tInstance;
  }

  /**
   * Re-inserts the self-closing tags into {@code plainText}, from the last tag to the first,
   * and replaces {@code plainText} with the result.
   *
   * @param sClosings the self-closing tags; emptied by this method
   * @param html the builder used to assemble the result
   * @param index the end position of the text still to be copied
   */
  private void processSelfClosingTags(Deque<TagInstance> sClosings,
                                      InsertStringBuilder html,
                                      int index) {
    int idx = index;
    while (sClosings.peek() != null) {
      TagInstance t = sClosings.pop();
      // tag and following text are concatenated: one insert is faster than two
      html.insertFirst(t.openingString() + plainText.substring(t.to(), idx));
      idx = t.from();
    }
    html.insertFirst(plainText.substring(0, idx));
    plainText = html.toString();
  }

  /**
   * Writes the opening markup of the tag on top of {@code ts}, followed by the text between
   * the tag start and {@code index}.
   *
   * @return the new end position of the text still to be copied
   */
  private int processOpeningTags(InsertStringBuilder html, Deque<TagInstance> ts, int index) {
    TagInstance t = ts.pop();
    String s = plainText.substring(t.from(), index);
    html.insertFirst(t.openingString() + s);
    return t.from();
  }

  /**
   * Writes the closing markup of the tag on top of {@code tags}, followed by the text between
   * the tag end and {@code index}, then moves the tag to {@code ts} until its opening markup is
   * written.
   *
   * @return the new end position of the text still to be copied
   */
  private int processClosingTags(InsertStringBuilder html, Deque<TagInstance> ts, int index) {
    TagInstance t = tags.pop();
    String s = plainText.substring(t.to(), index);
    html.insertFirst(t.closingString() + s);
    ts.push(t);
    return t.to();
  }

  /**
   * Builds a regex matching {@code s} only where the match directly follows a double quote or
   * is directly followed by one.
   *
   * @param s the regex fragment to constrain
   * @return the constrained regex
   */
  static String notBewteenQuotesRegex(String s) {
    return s + "(?:(?<=[\"]" + s + ")|(?=[\"]))";
  }
}