// com.mdfromhtml.markdown.transform.ExtractHTMLJSON Maven / Gradle / Ivy
//
// Go to download
/**
 * (c) Copyright 2019-2020 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.api.json.JSON;
import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;

/**
 * Reads the text file produced by issuing curl commands to the html_extractor
 * and creates individual files for use in the ExtractHTMLJSON utility.
 */
public class ExtractHTMLJSON {

   /**
    * Creates a new extractor instance. The constructor itself performs no
    * initialization.
    */
   public ExtractHTMLJSON() {
   }

   /**
    * Main entry point to read a specified input directory to find text files
    * containing sequences of JSON Objects to be extracted and written to
    * separate files in the output directory.
    * <p>
    * When running interactively the user is prompted before any work is done:
    * an empty answer (just Enter) continues, any other answer skips
    * processing. The JVM is always terminated through
    * {@link System#exit(int)}; the exit value is 0 on success and non-zero
    * when the parameters could not be obtained, the input files could not be
    * listed, or processing a file failed.
    * 
    * @param args
    *           inputPath, and outputPath (if not supplied, the program prompts
    *           for their values)
    */
   public static void main(String[] args) {
      int exitVal = 0;
      ExtractHTMLJSON pgm = new ExtractHTMLJSON();
      if (pgm.getParams(args)) {
         if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
               + " will be read from " + pgm._inputPath
               + "\nand the generated htmljson files (.json) " + "saved in "
               + pgm._outputPath);
            System.out
               .println("\nFilter strings used to check html for bad pages:");
            for (String filter : pgm._filters) {
               System.out.println(filter);
            }
            System.out.println();
         }
         if (pgm._interactive) {
            // An empty reply means "continue": drop out of interactive mode so
            // the processing branch below runs. Any other reply leaves the
            // flag set and processing is skipped.
            if (MDfromHTMLUtils
               .prompt("Press q to quit or press Enter to continue...")
               .length() == 0) {
               pgm._interactive = false;
            }
         }
         if (!pgm._interactive) {
            try {
               // The element type must be declared: a raw List yields Object
               // elements and cannot be iterated as Path below.
               List<Path> files = MDfromHTMLUtils.listSourceFiles(
                  FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                  pgm._ext);
               for (Path file : files) {
                  exitVal = pgm.doWork(file);
                  if (exitVal != 0) {
                     // stop at the first file that fails
                     break;
                  }
               }
            } catch (Exception e) {
               System.out
                  .println("Error: Can not reference files with extension "
                     + pgm._ext + " in directory " + pgm._inputPath
                     + " reason: " + e.getLocalizedMessage());
               exitVal = -1;
            }
         }
         if (pgm._thumbsucker) {
            System.out.println();
         }
      } else {
         exitVal = -1;
      }
      if (pgm._thumbsucker) {
         System.out.println("Goodbye");
      }
      System.exit(exitVal);
   }

   /**
    * Reads the specified text file line by line and splits it into individual
    * JSON documents. Lines are accumulated in a buffer; a line starting with
    * "}" ends the current document, at which point the buffered text is passed
    * to {@code saveFile} and the buffer is reset. Text still buffered after
    * the last line is passed to {@code saveFile} as well.
    * <p>
    * If an {@link OutOfMemoryError} occurs while a document is being
    * accumulated or saved, the buffered content is discarded and input is
    * skipped up to the next line starting with "{", where accumulation
    * resumes.
    * <p>
    * The reader is closed before returning, whether or not processing
    * succeeded.
    * 
    * @param file
    *           the text file containing the sequence of JSON objects
    * @return exit value (0 indicates success, otherwise -1 for failure)
    */
   int doWork(Path file) {
      int exitVal = 0;
      BufferedReader br = null;
      try {
         String fqFileName = file.toString();
         if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
         }
         br = MDfromHTMLUtils.openTextFile(fqFileName);
         StringBuilder sb = new StringBuilder();
         String line = br.readLine();
         int linenum = 0;
         while (line != null) {
            linenum++;
            try {
               sb.append(line);
               sb.append("\n");
               if (line.startsWith("}")) {
                  // closing brace in column 0 ends the current document
                  saveFile(sb.toString());
                  sb = new StringBuilder();
               }
               line = br.readLine();
            } catch (OutOfMemoryError oome) {
               // drop the oversized document and resynchronize on the next
               // line that opens a new object
               sb = new StringBuilder();
               System.out.println("Error reading line "+linenum);
               // read to next line starting with {
               line = br.readLine();
               linenum++;
               while (line != null) {
                  if (line.startsWith("{")) {
                     sb.append(line);
                     sb.append("\n");
                     line = br.readLine();
                     linenum++;
                     break;
                  }
                  line = br.readLine();
                  // readLine returns null at end of file; guard before
                  // inspecting the skipped line
                  if (line != null && line.contains("\"url\":")) {
                     System.out.println("Skipping: "+line);
                  }
                  linenum++;
               }
               System.out.println("Resuming at line "+linenum);
            }
         }
         if (sb.length() > 0) {
            try {
               saveFile(sb.toString());
            } catch (Exception e) {
               System.out.println("\n\nError: "+e.getLocalizedMessage()+"\n");
               System.out.println(sb.toString());
               System.out.println("\n\nEnd Error: "+e.getLocalizedMessage()+"\n");
            }
         }
      } catch (Exception e) {
         e.printStackTrace();
         exitVal = -1;
      } finally {
         // close on every path so a failure part-way through the file does
         // not leak the reader; br is still null if the open itself failed
         if (br != null) {
            try {
               MDfromHTMLUtils.closeTextFile(br);
            } catch (Exception e) {
               e.printStackTrace();
               exitVal = -1;
            }
         }
      }
      return exitVal;
   }

   /**
    * Parses the supplied text as JSON and writes the resulting object to the
    * next sequentially numbered .json file in the output directory. When
    * {@code filterContent} returns true for the object, ".rejected" is
    * appended to the file name. Text that fails to parse, or that parses to
    * something other than a JSON object, is reported on standard output and
    * nothing is written. The file counter advances on every call, including
    * calls that end up writing nothing.
    * 
    * @param jsonContent
    *           JSON String to be saved as a file.
    */
   void saveFile(String jsonContent) {
      final String outputFileName = _outputPath + _filePrefix
         + MDfromHTMLUtils.padLeft(_fileCounter++, 4, '0') + ".json";

      final JSONObject obj;
      try {
         Object parsed = JSON.parse(jsonContent);
         if (!(parsed instanceof JSONObject)) {
            System.out.println("Error: got a non-JSONObject from parse: "+parsed);
            return;
         }
         obj = (JSONObject) parsed;
      } catch (IOException e) {
         System.out.println("Error: Can not transform to JSON: "
            + e.getLocalizedMessage() + "\n" + jsonContent);
         return;
      } catch (ClassCastException cce) {
         System.out.println("Error: Can not parse to JSON: "
            + cce.getLocalizedMessage() + "\n" + jsonContent);
         return;
      }

      try {
         // filtered objects are still written, but under a ".rejected" name
         String target = filterContent(obj) ? outputFileName + ".rejected"
            : outputFileName;
         MDfromHTMLUtils.saveJSONFile(target, obj);
      } catch (Exception e) {
         System.out.println("Can not save file " + outputFileName + "  Error: "
            + e.getLocalizedMessage());
      }
   }

   /**
    * Checks the html for filter strings and returns false if none are found. If
    * no html nor captureArray then returns true. If the capture array contains
    * an object that should be filtered, it is removed from the array.
    * 
    * @param jsonObj
    *           JSON object to be checked for filters.
    * @return
    */
   boolean filterContent(JSONObject jsonObj) {
      boolean result = true;
      if (jsonObj == null) {
         return result;
      }
      JSONArray rejectedURLs = new JSONArray();
      JSONArray captureArray = (JSONArray) jsonObj.get("captureArray");
      if (captureArray != null) {
         JSONObject htmlObj = new JSONObject();
         for (Iterator