/*
 * All downloads are FREE. The search and download functionality uses the official Maven repository.
 * Please wait. This can take a few minutes ...
 * Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance.
 * Project price: only $1
 * You can buy this project and download/modify it as often as you want.
 * com.mdfromhtml.markdown.transform.GetHTMLJSONfromHTML Maven / Gradle / Ivy
 */
/**
* (c) Copyright 2019-2023 IBM Corporation
* 1 New Orchard Road,
* Armonk, New York, 10504-1722
* United States
* +1 914 499 1900
* support: Nathaniel Mills [email protected]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.mdfromhtml.markdown.transform;
import java.io.BufferedReader;
import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;
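/**
 * Command line utility that reads the html files found in an input directory
 * and writes one htmljson (.json) file per html file into an output directory.
 * Each generated file holds a "captureArray" with a single object containing
 * the "url" looked up from a tab-separated url-to-html map file and the "html"
 * content read from the source file.
 */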
public class GetHTMLJSONfromHTML {
/**
* @param args
*/
public static void main(String[] args) {
int exitVal = 0;
GetHTMLJSONfromHTML pgm = new GetHTMLJSONfromHTML();
if (pgm.getParams(args)) {
if (pgm._thumbsucker) {
System.out.println("\nFiles ending with ." + pgm._ext
+ " will be read from " + pgm._inputPath //
+ "\nand the generated html json file "
+ "saved in " + pgm._outputPath); //
}
if (pgm._interactive) {
if (MDfromHTMLUtils
.prompt("Press q to quit or press Enter to continue...")
.length() == 0) {
pgm._interactive = false;
}
}
if (!pgm._interactive) {
try {
List inputLines = MDfromHTMLUtils.loadTextFile(pgm._inputMapFile);
Map html2urlMap = new HashMap();
String[] parts = null;
int lineNum = 0;
for (String inputLine: inputLines) {
lineNum++;
parts = inputLine.split("\t");
if (parts.length != 2) {
System.out.println("Errant line: "+lineNum+": "+inputLine);
continue;
}
html2urlMap.put(parts[1].trim(), parts[0].trim());
}
List files = MDfromHTMLUtils.listSourceFiles(
FileSystems.getDefault().getPath(pgm._inputPath.toString()),
pgm._ext);
for (Path file : files) {
try {
exitVal = pgm.doWork(file,html2urlMap);
if (exitVal != 0) {
break;
}
} catch (Exception e) {
e.printStackTrace();
}
}
} catch (Exception e) {
System.out
.println("Error: Can not reference files with extension "
+ pgm._ext + " in directory " + pgm._inputPath
+ " reason: " + e.getLocalizedMessage());
exitVal = -1;
}
}
if (pgm._thumbsucker) {
System.out.println();
}
} else {
exitVal = -1;
}
if (pgm._thumbsucker) {
System.out.println("Goodbye");
}
System.exit(exitVal);
}
/**
* escape()
*
* Escape a given input String of HTML to enable it to be stored in JSON as a quoted String value
* retaining its characteristics when retrieved and referenced as a Java String.
*
* @param str The input String.
* @return The output String.
**/
public String escapeString(String str){
return str.replace("\\", "\\\\")
.replace("\b", "\\b")
.replace("\f", "\\f")
.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\t", "\\t")
.replace("\'", "\\'")
.replace("\"", "\\\"");
}
/**
* Process the specified html file to transform its content into an htmljson
* file and save it to the specified output directory.
*
* @param file
* the file containing the raw html captured by as text
* @return exit value (0 indicates success, otherwise -1 for failure)
*/
int doWork(Path file, Maphtml2urlMap) {
int exitVal = 0;
try {
String fqFileName = file.toString();
if (_thumbsucker) {
System.out.println("Processing: " + fqFileName);
}
JSONObject outputJsonFile = new JSONObject();
JSONArray captureArray = new JSONArray();
outputJsonFile.put("captureArray", captureArray);
JSONObject outputJson = new JSONObject();
String shortFileName = fqFileName
.substring(fqFileName.lastIndexOf(File.separator) + 1);
int index = shortFileName.lastIndexOf("." + _ext);
if (index < 1) {
System.out.println(fqFileName + " doesn't end with ." + _ext);
exitVal = -1;
} else {
String outputFileName = _outputPath+shortFileName.substring(0,index)+".json";
outputJson.put("url", html2urlMap.get(shortFileName));
BufferedReader br = MDfromHTMLUtils.openTextFile(fqFileName);
StringBuffer sb = new StringBuffer();
String line = br.readLine();
String test = "";
while (line != null) {
// remove all indentation and trailing whitespace
test = line.trim();
// skip empty lines
if (test.length() == 0) {
line = br.readLine();
continue;
}
// preserve linebreaks
sb.append(line);
line = br.readLine();
}
outputJson.put("html", sb.toString());
captureArray.add(outputJson);
MDfromHTMLUtils.saveJSONFile(outputFileName, outputJsonFile);
}
} catch (Exception e) {
e.printStackTrace();
exitVal = -1;
}
return exitVal;
}
/**
* Get the parameters necessary for program execution: input directory,
* output directory, and whether to append annotation details to sentences
*
* @param args
* inputPath, outputPath, showAnnotationsFlag
* @return true if we have sufficient parameters to execute the program
*/
boolean getParams(String[] args) {
String inputPath = "." + File.separator + "data" + File.separator
+ "htmls";
String outputPath = "." + File.separator + "data" + File.separator + "htmljson";
String inputMapFile = "." + File.separator + "data" + File.separator + "urls_to_htmls.tsv";
String tmp = "";
try {
if (args.length >= 1) {
inputPath = args[0];
} else {
_interactive = true;
_thumbsucker = true;
tmp = MDfromHTMLUtils.prompt(
"Enter the fully qualified path to directory containing " + _ext
+ " capture files, or q to exit (" + inputPath + "):");
if (tmp == null || tmp.length() == 0) {
tmp = inputPath;
}
if (tmp.toLowerCase().equals("q")) {
return false;
}
inputPath = tmp;
}
if (inputPath.endsWith(File.separator) == false) {
inputPath += File.separator;
}
_inputPath = FileSystems.getDefault().getPath(inputPath);
} catch (InvalidPathException ipe) {
System.out.println(
"Error: " + args[0] + " is not a valid directory to form a path.");
return false;
}
if (args.length >= 2) {
outputPath = args[1];
} else {
_interactive = true;
_thumbsucker = true;
tmp = MDfromHTMLUtils.prompt(
"Enter the fully qualified path to the htmljson output directory, or q to exit ("
+ outputPath + "):");
if (tmp == null || tmp.length() == 0) {
tmp = outputPath;
}
if (tmp.toLowerCase().equals("q")) {
return false;
}
outputPath = tmp;
}
if (outputPath.endsWith(File.separator) == false) {
outputPath += File.separator;
}
File testOutput = new File(outputPath);
if (testOutput.exists() == false) {
System.out.println(
"Error: The output directory \"" + outputPath + "\" must exist.");
return false;
}
if (testOutput.isDirectory() == false) {
System.out.println("Error: The output directory \"" + outputPath
+ "\" must be a directory.");
return false;
}
_outputPath = outputPath;
if (args.length >= 3) {
inputMapFile = args[1];
} else {
_interactive = true;
_thumbsucker = true;
tmp = MDfromHTMLUtils.prompt(
"Enter the fully qualified path to the url to html file tsv file, or q to exit ("
+ inputMapFile + "):");
if (tmp == null || tmp.length() == 0) {
tmp = inputMapFile;
}
if (tmp.toLowerCase().equals("q")) {
return false;
}
inputMapFile = tmp;
}
File testInputMapFile = new File(inputMapFile);
if (testInputMapFile.exists() == false) {
System.out.println(
"Error: The url to html file tsv file \"" + inputMapFile + "\" must exist.");
return false;
}
if (testInputMapFile.isDirectory() == true) {
System.out.println("Error: The entry can not be a directory \"" + inputMapFile
+ "\" must be a tsv file.");
return false;
}
_inputMapFile = inputMapFile;
if (args.length >= 4) {
_thumbsucker = new Boolean(args[3]);
}
return true;
}
String _ext = "html";
Path _inputPath = null;
String _inputMapFile = "";
boolean _interactive = false;
String _outputPath = ".";
boolean _thumbsucker = false;
}