All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.pageseeder.ox.psml.validation.CharactersValidator Maven / Gradle / Ivy

/*
 * Copyright 2021 Allette Systems (Australia)
 * http://www.allette.com.au
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pageseeder.ox.psml.validation;

import org.pageseeder.xmlwriter.XMLWriter;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Scans files for characters outside the Basic Latin Unicode block and renders the
 * collected per-character statistics as XML or as a PSML table.
 */
public class CharactersValidator {

  /** Maximum number of document paths listed per character in the aggregated PSML report. */
  private static final int MAX_LISTED_DOCUMENTS = 20;

  /**
   * Reports every character of a file that is outside the Basic Latin Unicode block.
   *
   * <p>The file is decoded as UTF-8 through a streaming reader, so a multi-byte sequence
   * is decoded correctly wherever it falls in the file. Line numbers start at 1 and are
   * incremented on each {@code '\n'}; column numbers start at 1 on every line.
   *
   * @param original the file to inspect
   *
   * @return a result named "characters" carrying one message per suspicious character and
   *         the per-character counts, or a result carrying a single error message when
   *         the file could not be read
   */
  public ValidationResult validateCharacters(File original) {
    String error;
    try (InputStream in = new FileInputStream(original);
         Reader reader = new InputStreamReader(in, StandardCharsets.UTF_8)) {
      CharactersData data = new CharactersData();
      List<String> errors = new ArrayList<>();
      int linenb = 1;
      int colnb = 1;
      char[] buffer = new char[1024 * 4];
      int read;
      while ((read = reader.read(buffer)) != -1) {
        for (int i = 0; i < read; i++) {
          char c = buffer[i];
          if (c == '\n') {
            linenb++;
            colnb = 0; // brought back to 1 by the increment below
          } else if (invalidChar(c)) {
            data.addChar(c);
            errors.add("Suspicious character '"+c+"' found at line "+linenb+", character "+colnb+". It can be replaced by entity &#x"+Integer.toHexString(c)+";");
          }
          colnb++;
        }
      }
      return new ValidationResult("characters", true, null, errors, data);
    } catch (IOException ex) {
      error = "Error when inspecting file: " + ex.getMessage();
    }
    return new ValidationResult("characters", true, null, error);
  }

  /**
   * @param c the character to test
   *
   * @return <code>true</code> if the character is not in the Basic Latin Unicode block
   */
  private boolean invalidChar(char c) {
    return Character.UnicodeBlock.of(c) != Character.UnicodeBlock.BASIC_LATIN;
  }

  /**
   * Per-file statistics: how many times each suspicious character was found.
   */
  public static class CharactersData implements ValidationResult.ExtraData {

    /** Number of occurrences, keyed by character. */
    private final Map<Character, Integer> characters = new HashMap<>();

    /**
     * Records one more occurrence of the character.
     *
     * @param c the character found
     */
    void addChar(char c) {
      Integer v = this.characters.get(c);
      this.characters.put(c, v == null ? 1 : v + 1);
    }

    /**
     * Writes a <code>characters</code> element holding one <code>character</code> element
     * (attributes <code>value</code> and <code>occurrence</code>) per recorded character.
     *
     * @param xml the XML writer to write to
     *
     * @throws IOException if thrown by the XML writer
     */
    @Override
    public void toXML(XMLWriter xml) throws IOException {
      xml.openElement("characters");
      for (Map.Entry<Character, Integer> entry : this.characters.entrySet()) {
        Character c = entry.getKey();
        Integer occurrence = entry.getValue();
        xml.openElement("character");
        // NOTE(review): the key is passed as a Character; confirm which XMLWriter.attribute
        // overload this selects, i.e. whether the character or its numeric code is written.
        xml.attribute("value", c);
        xml.attribute("occurrence", occurrence);
        xml.closeElement();
      }
      xml.closeElement();
    }
  }

  /**
   * Writes the statistics of a single file as a PSML section containing a table with
   * one row per character: the character, its hexadecimal entity and its occurrence count.
   * Nothing is written if the data is <code>null</code> or empty.
   *
   * @param data the statistics to render
   * @param psml the XML writer to write the PSML to
   *
   * @throws IOException if thrown by the XML writer
   */
  public static void charactersDataToPSML(CharactersData data, XMLWriter psml) throws IOException {
    if (data == null || data.characters.isEmpty()) return;
    openExtraTable(psml);
    for (Map.Entry<Character, Integer> entry : data.characters.entrySet()) {
      Character c = entry.getKey();
      psml.openElement("row");
      psml.element("hcell", c.toString());
      psml.element("cell", "&#x"+Integer.toHexString(c)+";");
      psml.element("cell", entry.getValue().toString());
      psml.closeElement();
    }
    closeExtraTable(psml);
  }

  /**
   * Writes the statistics aggregated over several files as a PSML section containing a
   * table with one row per character: the character, its hexadecimal entity, the total
   * number of occurrences and the list of documents in which it was found (at most
   * {@value #MAX_LISTED_DOCUMENTS} paths are listed).
   *
   * <p>Entries that are not {@link CharactersData} are ignored. Nothing is written if
   * the map is <code>null</code> or empty, or if no character was recorded.
   *
   * @param datas the extra data of each file, keyed by file path
   * @param psml  the XML writer to write the PSML to
   *
   * @throws IOException if thrown by the XML writer
   */
  public static void charactersDataToPSML(Map<String, ? extends ValidationResult.ExtraData> datas, XMLWriter psml) throws IOException {
    if (datas == null || datas.isEmpty()) return;
    Map<Character, CharacterData> characters = new HashMap<>();
    for (Map.Entry<String, ? extends ValidationResult.ExtraData> doc : datas.entrySet()) {
      ValidationResult.ExtraData data = doc.getValue();
      if (!(data instanceof CharactersData)) continue;
      String path = doc.getKey();
      // every character of this document contributes its own count and this document's path
      for (Map.Entry<Character, Integer> entry : ((CharactersData) data).characters.entrySet()) {
        CharacterData cdata = characters.get(entry.getKey());
        if (cdata == null) {
          cdata = new CharacterData();
          characters.put(entry.getKey(), cdata);
        }
        cdata.files.add(path);
        cdata.occurrence += entry.getValue();
      }
    }
    if (characters.isEmpty()) return;
    openExtraTable(psml);
    for (Map.Entry<Character, CharacterData> entry : characters.entrySet()) {
      Character c = entry.getKey();
      CharacterData cdata = entry.getValue();
      String id = String.valueOf(c.hashCode());
      int documents = cdata.files.size();
      psml.openElement("row");
      psml.element("hcell", c.toString());
      psml.element("hcell", "&#x"+Integer.toHexString(c)+";");
      psml.openElement("cell");
      psml.writeXML(cdata.occurrence+" in "+documents+" document"+(documents==1?"":"s"));
      psml.openElement("list");
      psml.attribute("role", "toggle-c"+id);
      int shown = 0;
      for (String p : cdata.files) {
        if (shown == MAX_LISTED_DOCUMENTS) {
          // more documents remain: say so instead of listing them
          psml.element("item", "Only first "+MAX_LISTED_DOCUMENTS+" documents shown...");
          break;
        }
        psml.element("item", p);
        shown++;
      }
      psml.closeElement(); // list
      psml.closeElement(); // cell
      psml.closeElement(); // row
    }
    closeExtraTable(psml);
  }

  /**
   * Opens the "extra" section, fragment and table and writes the table header row.
   * The three column labels are the ones found in the original header text.
   */
  private static void openExtraTable(XMLWriter psml) throws IOException {
    psml.openElement("section");
    psml.attribute("id", "extra");
    psml.openElement("fragment");
    psml.attribute("id", "extra");
    psml.openElement("table");
    psml.openElement("row");
    psml.attribute("part", "header");
    psml.element("hcell", "Character");
    psml.element("hcell", "Entity");
    psml.element("hcell", "Occurrences");
    psml.closeElement(); // row
  }

  /** Closes the table, fragment and section opened by {@link #openExtraTable(XMLWriter)}. */
  private static void closeExtraTable(XMLWriter psml) throws IOException {
    psml.closeElement(); // table
    psml.closeElement(); // fragment
    psml.closeElement(); // section
  }

  /**
   * Aggregated statistics for one character across several documents.
   */
  private static class CharacterData {
    /** Total number of occurrences over all documents. */
    int occurrence = 0;
    /** Paths of the documents containing the character. */
    final List<String> files = new ArrayList<>();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy