// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// com.mdfromhtml.markdown.transform.GetHTMLJSONfromHTML Maven / Gradle / Ivy

/**
 * (c) Copyright 2019-2023 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.BufferedReader;
import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;

/**
 * 
 */

public class GetHTMLJSONfromHTML {

	/**
	 * @param args
	 */

	public static void main(String[] args) {
      int exitVal = 0;
		GetHTMLJSONfromHTML pgm = new GetHTMLJSONfromHTML();
      if (pgm.getParams(args)) {
         if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
               + " will be read from " + pgm._inputPath //
               + "\nand the generated html json file "
               + "saved in " + pgm._outputPath); //
         }
         if (pgm._interactive) {
            if (MDfromHTMLUtils
               .prompt("Press q to quit or press Enter to continue...")
               .length() == 0) {
               pgm._interactive = false;
            }
         }
         if (!pgm._interactive) {
            try {
            	List inputLines = MDfromHTMLUtils.loadTextFile(pgm._inputMapFile);
            	Map html2urlMap = new HashMap();
            	String[] parts = null;
            	int lineNum = 0;
            	for (String inputLine: inputLines) {
            		lineNum++;
            		parts = inputLine.split("\t");
            		if (parts.length != 2) {
            			System.out.println("Errant line: "+lineNum+": "+inputLine);
            			continue;
            		}
            		html2urlMap.put(parts[1].trim(), parts[0].trim());
            	}
               List files = MDfromHTMLUtils.listSourceFiles(
                  FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                  pgm._ext);
               for (Path file : files) {
                  try {
                     exitVal = pgm.doWork(file,html2urlMap);
                     if (exitVal != 0) {
                        break;
                     }
                  } catch (Exception e) {
                     e.printStackTrace();
                  }
               }
            } catch (Exception e) {
               System.out
                  .println("Error: Can not reference files with extension "
                     + pgm._ext + " in directory " + pgm._inputPath
                     + " reason: " + e.getLocalizedMessage());
               exitVal = -1;
            }
         }
         if (pgm._thumbsucker) {
            System.out.println();
         }
      } else {
         exitVal = -1;
      }
      if (pgm._thumbsucker) {
         System.out.println("Goodbye");
      }
      System.exit(exitVal);

	}
	
	/**
	 * escape()
	 *
	 * Escape a given input String of HTML to enable it to be stored in JSON as a quoted String value 
	 * retaining its characteristics when retrieved and referenced as a Java String.
	 *
	 * @param str The input String.
	 * @return The output String.
	 **/
	public String escapeString(String str){
	  return str.replace("\\", "\\\\")
	          .replace("\b", "\\b")
	          .replace("\f", "\\f")
	          .replace("\n", "\\n")
	          .replace("\r", "\\r")
	          .replace("\t", "\\t")
	          .replace("\'", "\\'")
	          .replace("\"", "\\\"");
	}
	
   /**
    * Process the specified html file to transform its content into an htmljson
    * file and save it to the specified output directory.
    * 
    * @param file
    *           the file containing the raw html captured by as text
    * @return exit value (0 indicates success, otherwise -1 for failure)
    */
   int doWork(Path file, Maphtml2urlMap) {
      int exitVal = 0;
      try {
         String fqFileName = file.toString();
         if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
         }
         JSONObject outputJsonFile = new JSONObject();
         JSONArray captureArray = new JSONArray();
         outputJsonFile.put("captureArray", captureArray);
         JSONObject outputJson = new JSONObject();
         String shortFileName = fqFileName
            .substring(fqFileName.lastIndexOf(File.separator) + 1);
         int index = shortFileName.lastIndexOf("." + _ext);
         if (index < 1) {
            System.out.println(fqFileName + " doesn't end with ." + _ext);
            exitVal = -1;
         } else {
         	String outputFileName = _outputPath+shortFileName.substring(0,index)+".json";
            outputJson.put("url", html2urlMap.get(shortFileName));
            BufferedReader br = MDfromHTMLUtils.openTextFile(fqFileName);
            StringBuffer sb = new StringBuffer();
            String line = br.readLine();
            String test = "";
            while (line != null) {
               // remove all indentation and trailing whitespace
               test = line.trim();
               // skip empty lines
               if (test.length() == 0) {
               	line = br.readLine();
               	continue;
               }
               // preserve linebreaks
               sb.append(line);
               line = br.readLine();
            }
            outputJson.put("html", sb.toString());
            captureArray.add(outputJson);
            MDfromHTMLUtils.saveJSONFile(outputFileName, outputJsonFile);
         }
      } catch (Exception e) {
         e.printStackTrace();
         exitVal = -1;
      }
      return exitVal;
   }


   /**
    * Get the parameters necessary for program execution: input directory,
    * output directory, and whether to append annotation details to sentences
    * 
    * @param args
    *           inputPath, outputPath, showAnnotationsFlag
    * @return true if we have sufficient parameters to execute the program
    */
   boolean getParams(String[] args) {
      String inputPath = "." + File.separator + "data" + File.separator
         + "htmls";
      String outputPath = "." + File.separator + "data" + File.separator + "htmljson";
      String inputMapFile = "." + File.separator + "data" + File.separator + "urls_to_htmls.tsv";
      String tmp = "";

      try {
         if (args.length >= 1) {
            inputPath = args[0];
         } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
               "Enter the fully qualified path to directory containing " + _ext
                  + " capture files, or q to exit (" + inputPath + "):");
            if (tmp == null || tmp.length() == 0) {
               tmp = inputPath;
            }
            if (tmp.toLowerCase().equals("q")) {
               return false;
            }
            inputPath = tmp;
         }
         if (inputPath.endsWith(File.separator) == false) {
            inputPath += File.separator;
         }
         _inputPath = FileSystems.getDefault().getPath(inputPath);
      } catch (InvalidPathException ipe) {
         System.out.println(
            "Error: " + args[0] + " is not a valid directory to form a path.");
         return false;
      }
      if (args.length >= 2) {
         outputPath = args[1];
      } else {
         _interactive = true;
         _thumbsucker = true;
         tmp = MDfromHTMLUtils.prompt(
            "Enter the fully qualified path to the htmljson output directory, or q to exit ("
               + outputPath + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = outputPath;
         }
         if (tmp.toLowerCase().equals("q")) {
            return false;
         }
         outputPath = tmp;
      }
      if (outputPath.endsWith(File.separator) == false) {
         outputPath += File.separator;
      }
      File testOutput = new File(outputPath);
      if (testOutput.exists() == false) {
         System.out.println(
            "Error: The output directory \"" + outputPath + "\" must exist.");
         return false;
      }
      if (testOutput.isDirectory() == false) {
         System.out.println("Error: The output directory \"" + outputPath
            + "\" must be a directory.");
         return false;
      }
      _outputPath = outputPath;
      
      if (args.length >= 3) {
         inputMapFile = args[1];
      } else {
         _interactive = true;
         _thumbsucker = true;
         tmp = MDfromHTMLUtils.prompt(
            "Enter the fully qualified path to the url to html file tsv file, or q to exit ("
               + inputMapFile + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = inputMapFile;
         }
         if (tmp.toLowerCase().equals("q")) {
            return false;
         }
         inputMapFile = tmp;
      }
      File testInputMapFile = new File(inputMapFile);
      if (testInputMapFile.exists() == false) {
         System.out.println(
            "Error: The url to html file tsv file \"" + inputMapFile + "\" must exist.");
         return false;
      }
      if (testInputMapFile.isDirectory() == true) {
         System.out.println("Error: The entry can not be a directory \"" + inputMapFile
            + "\" must be a tsv file.");
         return false;
      }
      _inputMapFile = inputMapFile;

      if (args.length >= 4) {
         _thumbsucker = new Boolean(args[3]);
      }

      return true;
   }

   String _ext = "html";
   Path _inputPath = null;
   String _inputMapFile = "";
   boolean _interactive = false;
   String _outputPath = ".";
   boolean _thumbsucker = false;

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy