All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.mdfromhtml.markdown.transform.ExtractHTMLJSON Maven / Gradle / Ivy

/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.api.json.JSON;
import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;

/**
 * Read the text file produced by issuing curl commands to the html_extractor to
 * create individual files for use in the ExtractHTMLJSON utility.
 */
public class ExtractHTMLJSON {

   /**
    * Creates an extractor whose settings start at the field defaults. The
    * constructor itself does no work; {@code main} fills in the paths and
    * options afterwards by calling {@code getParams}.
    */
   public ExtractHTMLJSON() {
   }

   /**
    * Main entry point to read a specified input directory to find text files
    * containing sequences of JSON Objects to be extracted and written to
    * separate files in the output directory.
    * <p>
    * The process always terminates through {@code System.exit}: 0 when every
    * input file was processed (or the user chose to quit at the confirmation
    * prompt), -1 when the parameters could not be obtained, the input files
    * could not be listed, or processing of a file failed.
    * 
    * @param args
    *           inputPath, and outputPath (if not supplied, the program prompts
    *           for their values)
    */
   public static void main(String[] args) {
      int exitVal = 0;
      ExtractHTMLJSON pgm = new ExtractHTMLJSON();
      if (pgm.getParams(args)) {
         // _thumbsucker is the verbose switch: echo the effective settings
         if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
               + " will be read from " + pgm._inputPath //
               + "\nand the generated htmljson files (.json) " + "saved in "
               + pgm._outputPath); //
            System.out
               .println("\nFilter strings used to check html for bad pages:");
            for (String filter : pgm._filters) {
               System.out.println(filter);
            }
            System.out.println();
         }
         if (pgm._interactive) {
            String answer = MDfromHTMLUtils
               .prompt("Press q to quit or press Enter to continue...");
            // Only an empty reply (plain Enter) continues. Any other reply,
            // or no reply at all (null), is treated as a request to quit.
            if (answer != null && answer.length() == 0) {
               pgm._interactive = false;
            }
         }
         if (!pgm._interactive) {
            try {
               // _inputPath is already a Path on the default file system
               List<Path> files = MDfromHTMLUtils
                  .listSourceFiles(pgm._inputPath, pgm._ext);
               for (Path file : files) {
                  exitVal = pgm.doWork(file);
                  if (exitVal != 0) {
                     // stop at the first file that fails
                     break;
                  }
               }
            } catch (Exception e) {
               System.out
                  .println("Error: Can not reference files with extension "
                     + pgm._ext + " in directory " + pgm._inputPath
                     + " reason: " + e.getLocalizedMessage());
               exitVal = -1;
            }
         }
         if (pgm._thumbsucker) {
            System.out.println();
         }
      } else {
         exitVal = -1;
      }
      if (pgm._thumbsucker) {
         System.out.println("Goodbye");
      }
      System.exit(exitVal);
   }

   /**
    * Splits the specified text file into its individual JSON objects and hands
    * each one to {@link #saveFile(String)}.
    * <p>
    * Lines are accumulated until a line starting with <code>}</code> is read;
    * that line closes the current object and the accumulated text is saved.
    * Any text left over at the end of the file is saved as well. If an
    * {@link OutOfMemoryError} occurs while accumulating, the partial object is
    * discarded and reading resumes at the next line starting with
    * <code>{</code>.
    * 
    * @param file
    *           the text file containing a sequence of JSON objects
    * @return exit value (0 indicates success, otherwise -1 for failure)
    */
   int doWork(Path file) {
      int exitVal = 0;
      try {
         String fqFileName = file.toString();
         if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
         }
         BufferedReader br = MDfromHTMLUtils.openTextFile(fqFileName);
         try {
            StringBuilder sb = new StringBuilder();
            String line = br.readLine();
            int linenum = 0;
            while (line != null) {
               linenum++;
               try {
                  sb.append(line).append("\n");
                  if (line.startsWith("}")) {
                     // closing brace in column one ends the current object
                     saveFile(sb.toString());
                     sb = new StringBuilder();
                  }
                  line = br.readLine();
               } catch (OutOfMemoryError oome) {
                  // drop the oversized object to release its memory
                  sb = new StringBuilder();
                  System.out.println("Error reading line "+linenum);
                  // read to next line starting with {
                  line = br.readLine();
                  linenum++;
                  while (line != null) {
                     if (line.startsWith("{")) {
                        sb.append(line);
                        sb.append("\n");
                        line = br.readLine();
                        linenum++;
                        break;
                     }
                     line = br.readLine();
                     // readLine returns null at end of file; guard before use
                     if (line != null && line.contains("\"url\":")) {
                        System.out.println("Skipping: "+line);
                     }
                     linenum++;
                  }
                  System.out.println("Resuming at line "+linenum);
               }
            }
            if (sb.length() > 0) {
               // trailing content that was not terminated by a "}" line
               try {
                  saveFile(sb.toString());
               } catch (Exception e) {
                  System.out.println("\n\nError: "+e.getLocalizedMessage()+"\n");
                  System.out.println(sb.toString());
                  System.out.println("\n\nEnd Error: "+e.getLocalizedMessage()+"\n");
               }
            }
         } finally {
            // always release the reader, even when reading or saving fails
            MDfromHTMLUtils.closeTextFile(br);
         }
      } catch (Exception e) {
         e.printStackTrace();
         exitVal = -1;
      }
      return exitVal;
   }

   /**
    * Parses the supplied text as JSON and writes the resulting object to the
    * next numbered file in the output directory. Objects flagged by
    * {@link #filterContent(JSONObject)} are written under the same name with a
    * ".rejected" suffix. Text that does not parse to a JSON object is reported
    * on standard output and not written.
    * 
    * @param jsonContent
    *           JSON String to be saved as a file.
    */
   void saveFile(String jsonContent) {
      // The counter advances on every call, including ones that fail to parse.
      String target = _outputPath + _filePrefix
         + MDfromHTMLUtils.padLeft(_fileCounter++, 4, '0') + ".json";

      JSONObject parsed;
      try {
         Object candidate = JSON.parse(jsonContent);
         if (!(candidate instanceof JSONObject)) {
            System.out.println("Error: got a non-JSONObject from parse: "+candidate);
            return;
         }
         parsed = (JSONObject) candidate;
      } catch (IOException ioe) {
         System.out.println("Error: Can not transform to JSON: "
            + ioe.getLocalizedMessage() + "\n" + jsonContent);
         return;
      } catch (ClassCastException badCast) {
         System.out.println("Error: Can not parse to JSON: "
            + badCast.getLocalizedMessage() + "\n" + jsonContent);
         return;
      }

      try {
         // filterContent returning true marks the object as rejected
         String destination = filterContent(parsed)
            ? target + ".rejected"
            : target;
         MDfromHTMLUtils.saveJSONFile(destination, parsed);
      } catch (Exception failure) {
         System.out.println("Can not save file " + target + "  Error: "
            + failure.getLocalizedMessage());
      }
   }

   /**
    * Checks the html for filter strings and returns false if none are found. If
    * no html nor captureArray then returns true. If the capture array contains
    * an object that should be filtered, it is removed from the array.
    * <p>
    * Every removal is recorded as a url/reason entry, and when at least one
    * entry exists the list is stored on the object under the "rejected" key.
    * <p>
    * NOTE(review): this copy of the method is truncated inside the capture
    * loop (see the marker below), so the html checks and the place where the
    * result is set to false are not visible here. Restore the missing lines
    * from the original source before relying on this method.
    * 
    * @param jsonObj
    *           JSON object to be checked for filters.
    * @return true when the object should be rejected (also returned for a null
    *         object); presumably false when acceptable content remains --
    *         confirm against the untruncated source
    */
   boolean filterContent(JSONObject jsonObj) {
      boolean result = true;
      if (jsonObj == null) {
         return result;
      }
      JSONArray rejectedURLs = new JSONArray();
      JSONArray captureArray = (JSONArray) jsonObj.get("captureArray");
      if (captureArray != null) {
         JSONObject htmlObj = new JSONObject();
         for (Iterator it = captureArray.iterator(); it.hasNext();) {
            htmlObj = (JSONObject) it.next();
            String url = (String) htmlObj.get("url");
            // compare URLs without a trailing slash
            if (url.endsWith("/")) {
               url = url.substring(0,url.length()-1);
            }
            // a URL already seen is dropped from the array and recorded
            if (_processedURLs.contains(url)) {
               JSONObject rejected = new JSONObject();
               rejected.put("url",url);
               rejected.put("reason","duplicate url");
               rejectedURLs.add(rejected);
               it.remove();
               continue;
            }
            String html = (String) htmlObj.get("html");
            // expect html to have a body tag otherwise, reject it
            // NOTE(review): the next line is truncated in this copy; the rest
            // of the loop body is missing -- TODO restore from the original.
            if (html != null && html.toLowerCase().indexOf(" 0);
      } else {
         String reason = "Filter: Content at " + _fileCounter
                  + " does not have a captureArray. message: "+jsonObj.get("message");
         JSONObject rejected = new JSONObject();
         rejected.put("reason",reason);
         rejectedURLs.add(rejected);

         System.out.println(reason);
      }
      if (rejectedURLs.size() > 0) {
         jsonObj.put("rejected",rejectedURLs);
      }
      return result;
   }

   /**
    * Get the parameters necessary for program execution: input directory,
    * output directory, starting file counter and verbosity. Missing values are
    * prompted for, and the reject strings are loaded from RejectStrings.txt in
    * the input directory (comment lines starting with # and empty lines are
    * skipped, the rest is lower-cased).
    * 
    * @param args
    *           inputPath, outputPath, starting file suffix (a positive
    *           number), and an optional true/false flag that turns the
    *           progress messages on or off
    * @return true if we have sufficient parameters to execute the program
    */
   boolean getParams(String[] args) {
      String inputPath = "./data/";
      String outputPath = "./data/htmljson";
      String tmp = "";

      try {
         if (args.length >= 1) {
            inputPath = args[0];
         } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
               "Enter the fully qualified path to directory containing " + _ext
                  + " html capture files, or q to exit (" + inputPath + "):");
            if (tmp == null || tmp.length() == 0) {
               tmp = inputPath;
            }
            if ("q".equalsIgnoreCase(tmp)) {
               return false;
            }
            inputPath = tmp;
         }
         if (!inputPath.endsWith(File.separator)) {
            inputPath += File.separator;
         }
         _inputPath = FileSystems.getDefault().getPath(inputPath);
      } catch (InvalidPathException ipe) {
         // report the value actually used; args may be empty when prompted
         System.out.println(
            "Error: " + inputPath + " is not a valid directory to form a path.");
         return false;
      }
      if (args.length >= 2) {
         outputPath = args[1];
      } else {
         _interactive = true;
         _thumbsucker = true;
         tmp = MDfromHTMLUtils.prompt(
            "Enter the fully qualified path to the htmljson output directory, or q to exit ("
               + outputPath + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = outputPath;
         }
         if ("q".equalsIgnoreCase(tmp)) {
            return false;
         }
         outputPath = tmp;
      }
      if (!outputPath.endsWith(File.separator)) {
         outputPath += File.separator;
      }
      File testOutput = new File(outputPath);
      if (!testOutput.exists()) {
         System.out.println(
            "Error: The output directory \"" + outputPath + "\" must exist.");
         return false;
      }
      if (!testOutput.isDirectory()) {
         System.out.println("Error: The output directory \"" + outputPath
            + "\" must be a directory.");
         return false;
      }
      _outputPath = outputPath;

      // The third argument is the starting file suffix; prompt when absent.
      if (args.length >= 3) {
         tmp = args[2];
      } else {
         tmp = MDfromHTMLUtils.prompt("Enter the starting file suffix or q to quit ("
            + _fileCounter + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = "" + _fileCounter;
         }
         if ("q".equalsIgnoreCase(tmp)) {
            return false;
         }
      }
      try {
         int test = Integer.parseInt(tmp);
         if (test < 1) {
            System.out.println("File suffix must be a positive number.");
            return false;
         }
         _fileCounter = test;
      } catch (NumberFormatException nfe) {
         System.out.println(
            "File suffix must be a positive number. Got \"" + tmp + "\"");
         return false;
      }

      if (args.length >= 4) {
         _thumbsucker = Boolean.parseBoolean(args[3]);
      }

      try {
         _filters = MDfromHTMLUtils
            .loadTextFile(_inputPath.resolve("RejectStrings.txt").toString());
         // rewrite filters in lowercase
         List<String> newFilters = new ArrayList<>();
         for (String filter : _filters) {
            if (filter.startsWith("#")) {
               // skip comments
               continue;
            }
            filter = filter.trim();
            if (filter.length() == 0) {
               // skip empty lines
               continue;
            }
            newFilters.add(filter.toLowerCase());
         }
         _filters = newFilters;
      } catch (Exception e) {
         // the reject strings are required; without them we can not run
         e.printStackTrace();
         return false;
      }
      return true;
   }

   String _ext = "text";
   Path _inputPath = null;
   boolean _interactive = false;
   String _outputPath = ".";
   boolean _thumbsucker = false;
   String _filePrefix = "htmljson_";
   int _fileCounter = 1;
   List _filters = new ArrayList();
   Set_processedURLs = new HashSet();

}