com.adobe.epubcheck.ctc.EntitySearch Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of epubcheck Show documentation
Show all versions of epubcheck Show documentation
EPUBCheck is a tool to validate the conformance of EPUB publications against
the EPUB specifications. EPUBCheck can be run as a standalone command-line tool or used
as a Java library.
package com.adobe.epubcheck.ctc;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Scanner;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import com.adobe.epubcheck.api.EPUBLocation;
import com.adobe.epubcheck.api.Report;
import com.adobe.epubcheck.messages.MessageId;
import com.adobe.epubcheck.ocf.EncryptionFilter;
import com.adobe.epubcheck.util.EPUBVersion;
/**
* === WARNING ==========================================
* This class is scheduled to be refactored and integrated
* in another package.
* Please keep changes minimal (bug fixes only) until then.
* ========================================================
*/
class EntitySearch extends TextSearch
{
static final Pattern entityPattern = Pattern.compile("&([A-Za-z0-9]+)([;|\\s])");
static final HashSet legalEntities2_0;
static final HashSet legalEntities3_0;
MessageId ENTITY_INVALID = MessageId.HTM_023;
MessageId ENTITY_OK = MessageId.HTM_006;
MessageId ENTITY_IMPROPER = MessageId.HTM_024;
static
{
legalEntities3_0 = new HashSet();
Collections.addAll(legalEntities3_0, "&", "'", """, "<", ">");
legalEntities2_0 = new HashSet();
Collections.addAll(legalEntities2_0, " ", "¡", "¢", "£", "¤",
"¥", "¦", "§", "¨", "©", "ª", "«", "¬", "", "®",
"¯", "°", "±", "²", "³", "´", "µ", "¶", "·", "¸",
"¹", "º", "»", "¼", "½", "¾", "¿", "À", "Á",
"Â", "Ã", "Ä", "Å", "Æ", "Ç", "È", "É", "Ê", "Ë",
"Ì", "Í", "Î", "Ï", "Ð", "Ñ", "Ò", "Ó", "Ô", "Õ",
"Ö", "×", "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "Þ", "ß",
"à", "á", "â", "ã", "ä", "å", "æ", "ç", "è", "é",
"ê", "ë", "ì", "í", "î", "ï", "ð", "ñ", "ò", "ó",
"ô", "õ", "ö", "÷", "ø", "ù", "ú", "û", "ü", "ý",
"þ", "ÿ", "Œ", "œ", "Š", "š", "Ÿ", "ƒ", "ˆ", "˜",
"Α", "Β", "Γ", "Δ", "Ε", "Ζ", "Η", "Θ", "Ι", "Κ",
"Λ", "Μ", "Ν", "Ξ", "Ο", "Π", "Ρ", "Σ", "Τ", "Υ", "Φ", "Χ",
"Ψ", "Ω", "α", "β", "γ", "δ", "ε", "ζ", "η", "θ", "ι",
"κ", "λ", "μ", "ν", "ξ", "ο", "π", "ρ", "ς", "σ", "τ",
"υ", "φ", "χ", "ψ", "ω", "ϑ", "ϒ", "ϖ", " ", " ", " ",
"", "", "", "", "–", "—", "‘", "’", "‚", "“", "”",
"„", "†", "‡", "•", "…", "‰", "′", "″", "‹", "›",
"‾", "⁄", "€", "ℑ", "℘", "ℜ", "™", "ℵ", "←", "↑",
"→", "↓", "↔", "↵", "⇐", "⇑", "⇒", "⇓", "⇔", "∀", "∂",
"∃", "∅", "∇", "∈", "∉", "∋", "∏", "∑", "−", "∗", "√",
"∝", "∞", "∠", "∧", "∨", "∩", "∪", "∫", "∴", "∼", "≅", "≈",
"≠", "≡", "≤", "≥", "⊂", "⊃", "⊄", "⊆", "⊇", "⊕", "⊗", "⊥",
"⋅", "⋮", "⌈", "⌉", "⌊", "⌋", "〈", "〉", "◊", "♠", "♣",
"♥", "♦");
}
public EntitySearch(EPUBVersion version, ZipFile zip, Report report)
{
super(version, zip, report);
}
public Vector Search(String entry)
{
Vector result = new Vector();
InputStream is = null;
Scanner in = null;
try
{
is = getInputStream(entry);
in = new Scanner(is);
int lineCounter = 1;
while (in.hasNextLine())
{
String line = in.nextLine();
Matcher matcher = entityPattern.matcher(line);
int position = 0;
while (matcher.find(position))
{
MessageId messageCode = ENTITY_INVALID;
position = matcher.end();
String matchedText = line.substring(matcher.start(), matcher.end());
if (version == EPUBVersion.VERSION_2)
{
if (legalEntities3_0.contains(matchedText) || legalEntities2_0.contains(matchedText))
{
// its in either the legal 2.0 list or the 3.0 list. Simply emit a usage message
messageCode = ENTITY_OK;
}
}
else if (version == EPUBVersion.VERSION_3)
{
if (legalEntities3_0.contains(matchedText))
{
// its in the 3.0 list. just emit a usage message
messageCode = ENTITY_OK;
}
else if (legalEntities2_0.contains(matchedText))
{
// its in the 2.0 list. Emit a usage message saying that only & ' etc. are allowed
messageCode = ENTITY_IMPROPER;
}
}
int contextStart = Math.max(0, matcher.start() - 20);
int contextEnd = Math.min(contextStart + 40, line.length() - 1);
String context = line.substring(contextStart, contextEnd);
if (messageCode == ENTITY_INVALID)
{
// emit the erroneous text along with the message
report.message(messageCode, EPUBLocation.create(entry, lineCounter, matcher.start(), context.trim()), matchedText);
}
else
{
report.message(messageCode, EPUBLocation.create(entry, lineCounter, matcher.start(), context.trim()));
}
}
lineCounter++;
}
}
catch (FileNotFoundException e1)
{
String fileName = new File(zip.getName()).getName();
report.message(MessageId.RSC_001, EPUBLocation.create(fileName), entry);
}
catch (IOException e1)
{
String fileName = new File(zip.getName()).getName();
report.message(MessageId.PKG_008, EPUBLocation.create(fileName), entry);
}
catch (Exception e)
{
e.printStackTrace();
report.message(MessageId.RSC_005, EPUBLocation.create(entry), e.getMessage());
}
finally
{
if (is != null)
{
try
{
is.close();
}
catch (Exception ignored)
{
}
}
if (in != null) {
in.close();
}
}
return result;
}
}