All downloads are free. Search and download functionality uses the official Maven repository.

com.mdfromhtml.markdown.transform.GetHTMLJSONfromHTML Maven / Gradle / Ivy

Go to download

Generate markdown (.md) files from html and url provided in JSON files. The name of the generated files will use the name of the JSON file, and an incrementing number starting with 1 for each JSON file read, and for each html reference within the files.

There is a newer version: 2.0.18
Show newest version
/**
 * (c) Copyright 2019-2023 IBM Corporation
 * 1 New Orchard Road, 
 * Armonk, New York, 10504-1722
 * United States
 * +1 914 499 1900
 * support: Nathaniel Mills [email protected]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.mdfromhtml.markdown.transform;

import java.io.BufferedReader;
import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;

/**
 * 
 */

public class GetHTMLJSONfromHTML {

	/**
	 * @param args
	 */

	public static void main(String[] args) {
      int exitVal = 0;
		GetHTMLJSONfromHTML pgm = new GetHTMLJSONfromHTML();
      if (pgm.getParams(args)) {
         if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
               + " will be read from " + pgm._inputPath //
               + "\nand the generated html json file "
               + "saved in " + pgm._outputPath); //
         }
         if (pgm._interactive) {
            if (MDfromHTMLUtils
               .prompt("Press q to quit or press Enter to continue...")
               .length() == 0) {
               pgm._interactive = false;
            }
         }
         if (!pgm._interactive) {
            try {
            	List inputLines = MDfromHTMLUtils.loadTextFile(pgm._inputMapFile);
            	Map html2urlMap = new HashMap();
            	String[] parts = null;
            	int lineNum = 0;
            	for (String inputLine: inputLines) {
            		lineNum++;
            		parts = inputLine.split("\t");
            		if (parts.length != 2) {
            			System.out.println("Errant line: "+lineNum+": "+inputLine);
            			continue;
            		}
            		html2urlMap.put(parts[1].trim(), parts[0].trim());
            	}
               List files = MDfromHTMLUtils.listSourceFiles(
                  FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                  pgm._ext);
               for (Path file : files) {
                  try {
                     exitVal = pgm.doWork(file,html2urlMap);
                     if (exitVal != 0) {
                        break;
                     }
                  } catch (Exception e) {
                     e.printStackTrace();
                  }
               }
            } catch (Exception e) {
               System.out
                  .println("Error: Can not reference files with extension "
                     + pgm._ext + " in directory " + pgm._inputPath
                     + " reason: " + e.getLocalizedMessage());
               exitVal = -1;
            }
         }
         if (pgm._thumbsucker) {
            System.out.println();
         }
      } else {
         exitVal = -1;
      }
      if (pgm._thumbsucker) {
         System.out.println("Goodbye");
      }
      System.exit(exitVal);

	}
	
	/**
	 * escape()
	 *
	 * Escape a given input String of HTML to enable it to be stored in JSON as a quoted String value 
	 * retaining its characteristics when retrieved and referenced as a Java String.
	 *
	 * @param str The input String.
	 * @return The output String.
	 **/
	public String escapeString(String str){
	  return str.replace("\\", "\\\\")
	          .replace("\b", "\\b")
	          .replace("\f", "\\f")
	          .replace("\n", "\\n")
	          .replace("\r", "\\r")
	          .replace("\t", "\\t")
	          .replace("\'", "\\'")
	          .replace("\"", "\\\"");
	}
	
   /**
    * Process the specified html file to transform its content into an htmljson
    * file and save it to the specified output directory.
    * 
    * @param file
    *           the file containing the raw html captured by as text
    * @return exit value (0 indicates success, otherwise -1 for failure)
    */
   int doWork(Path file, Maphtml2urlMap) {
      int exitVal = 0;
      try {
         String fqFileName = file.toString();
         if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
         }
         JSONObject outputJsonFile = new JSONObject();
         JSONArray captureArray = new JSONArray();
         outputJsonFile.put("captureArray", captureArray);
         JSONObject outputJson = new JSONObject();
         String shortFileName = fqFileName
            .substring(fqFileName.lastIndexOf(File.separator) + 1);
         int index = shortFileName.lastIndexOf("." + _ext);
         if (index < 1) {
            System.out.println(fqFileName + " doesn't end with ." + _ext);
            exitVal = -1;
         } else {
         	String outputFileName = _outputPath+shortFileName.substring(0,index)+".json";
            outputJson.put("url", html2urlMap.get(shortFileName));
            BufferedReader br = MDfromHTMLUtils.openTextFile(fqFileName);
            StringBuffer sb = new StringBuffer();
            String line = br.readLine();
            String test = "";
            while (line != null) {
               // remove all indentation and trailing whitespace
               test = line.trim();
               // skip empty lines
               if (test.length() == 0) {
               	line = br.readLine();
               	continue;
               }
               // preserve linebreaks
               sb.append(line);
               line = br.readLine();
            }
            outputJson.put("html", sb.toString());
            captureArray.add(outputJson);
            MDfromHTMLUtils.saveJSONFile(outputFileName, outputJsonFile);
         }
      } catch (Exception e) {
         e.printStackTrace();
         exitVal = -1;
      }
      return exitVal;
   }


   /**
    * Get the parameters necessary for program execution: input directory,
    * output directory, and whether to append annotation details to sentences
    * 
    * @param args
    *           inputPath, outputPath, showAnnotationsFlag
    * @return true if we have sufficient parameters to execute the program
    */
   boolean getParams(String[] args) {
      String inputPath = "." + File.separator + "data" + File.separator
         + "htmls";
      String outputPath = "." + File.separator + "data" + File.separator + "htmljson";
      String inputMapFile = "." + File.separator + "data" + File.separator + "urls_to_htmls.tsv";
      String tmp = "";

      try {
         if (args.length >= 1) {
            inputPath = args[0];
         } else {
            _interactive = true;
            _thumbsucker = true;
            tmp = MDfromHTMLUtils.prompt(
               "Enter the fully qualified path to directory containing " + _ext
                  + " capture files, or q to exit (" + inputPath + "):");
            if (tmp == null || tmp.length() == 0) {
               tmp = inputPath;
            }
            if (tmp.toLowerCase().equals("q")) {
               return false;
            }
            inputPath = tmp;
         }
         if (inputPath.endsWith(File.separator) == false) {
            inputPath += File.separator;
         }
         _inputPath = FileSystems.getDefault().getPath(inputPath);
      } catch (InvalidPathException ipe) {
         System.out.println(
            "Error: " + args[0] + " is not a valid directory to form a path.");
         return false;
      }
      if (args.length >= 2) {
         outputPath = args[1];
      } else {
         _interactive = true;
         _thumbsucker = true;
         tmp = MDfromHTMLUtils.prompt(
            "Enter the fully qualified path to the htmljson output directory, or q to exit ("
               + outputPath + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = outputPath;
         }
         if (tmp.toLowerCase().equals("q")) {
            return false;
         }
         outputPath = tmp;
      }
      if (outputPath.endsWith(File.separator) == false) {
         outputPath += File.separator;
      }
      File testOutput = new File(outputPath);
      if (testOutput.exists() == false) {
         System.out.println(
            "Error: The output directory \"" + outputPath + "\" must exist.");
         return false;
      }
      if (testOutput.isDirectory() == false) {
         System.out.println("Error: The output directory \"" + outputPath
            + "\" must be a directory.");
         return false;
      }
      _outputPath = outputPath;
      
      if (args.length >= 3) {
         inputMapFile = args[1];
      } else {
         _interactive = true;
         _thumbsucker = true;
         tmp = MDfromHTMLUtils.prompt(
            "Enter the fully qualified path to the url to html file tsv file, or q to exit ("
               + inputMapFile + "):");
         if (tmp == null || tmp.length() == 0) {
            tmp = inputMapFile;
         }
         if (tmp.toLowerCase().equals("q")) {
            return false;
         }
         inputMapFile = tmp;
      }
      File testInputMapFile = new File(inputMapFile);
      if (testInputMapFile.exists() == false) {
         System.out.println(
            "Error: The url to html file tsv file \"" + inputMapFile + "\" must exist.");
         return false;
      }
      if (testInputMapFile.isDirectory() == true) {
         System.out.println("Error: The entry can not be a directory \"" + inputMapFile
            + "\" must be a tsv file.");
         return false;
      }
      _inputMapFile = inputMapFile;

      if (args.length >= 4) {
         _thumbsucker = new Boolean(args[3]);
      }

      return true;
   }

   String _ext = "html";
   Path _inputPath = null;
   String _inputMapFile = "";
   boolean _interactive = false;
   String _outputPath = ".";
   boolean _thumbsucker = false;

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy