com.mdfromhtml.markdown.transform.ExtractHTMLJSON Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of MarkdownGenerator Show documentation
Show all versions of MarkdownGenerator Show documentation
Generate markdown (.md) files from html and url provided in JSON files. The name of the generated files will use the name of the JSON file, and an incrementing number starting with 1 for each JSON file read, and for each html reference within the files.
/**
* (c) Copyright 2019-2020 IBM Corporation
* 1 New Orchard Road,
* Armonk, New York, 10504-1722
* United States
* +1 914 499 1900
* support: Nathaniel Mills [email protected]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.mdfromhtml.markdown.transform;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.api.json.JSON;
import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;
/**
* Read the text file produced by issuing curl commands to the html_extractor to
* create individual files for use in the ExtractHTMLJSON utility.
*/
public class ExtractHTMLJSON {
/**
 * Creates the extractor and loads the reject filter strings from
 * "RejectStrings.txt".
 * <p>
 * Lines whose first character is '#' are treated as comments and skipped
 * (the check is made before trimming, so an indented '#' line is kept).
 * Lines that are empty after trimming are skipped as well. Every remaining
 * filter is trimmed and stored in lowercase.
 * <p>
 * Loading is best effort: any exception is reported through its stack trace
 * and the filters are left as they were when the exception occurred.
 */
public ExtractHTMLJSON() {
    try {
        _filters = MDfromHTMLUtils.loadTextFile("RejectStrings.txt");
        // rewrite filters in lowercase
        List<String> newFilters = new ArrayList<String>();
        for (String filter : _filters) {
            if (filter.startsWith("#")) {
                // skip comments
                continue;
            }
            filter = filter.trim();
            if (filter.length() == 0) {
                // skip empty lines
                continue;
            }
            newFilters.add(filter.toLowerCase());
        }
        _filters = newFilters;
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Main entry point to read a specified input directory to find text files
 * containing sequences of JSON Objects to be extracted and written to
 * separate files in the output directory.
 * <p>
 * The process exits with status 0 on success and -1 when the parameters are
 * rejected, the input files can not be listed, or processing a file fails.
 * Processing stops at the first file that fails.
 *
 * @param args
 *           inputPath, and outputPath (if not supplied, the program prompts
 *           for their values)
 */
public static void main(String[] args) {
    int exitVal = 0;
    ExtractHTMLJSON pgm = new ExtractHTMLJSON();
    if (pgm.getParams(args)) {
        if (pgm._thumbsucker) {
            System.out.println("\nFiles ending with ." + pgm._ext
                + " will be read from " + pgm._inputPath
                + "\nand the generated htmljson files (.json) " + "saved in "
                + pgm._outputPath);
            System.out
                .println("\nFilter strings used to check html for bad pages:");
            // The constructor swallows load failures, so the filters may
            // be unset here; avoid a NullPointerException in that case.
            if (pgm._filters != null) {
                for (String filter : pgm._filters) {
                    System.out.println(filter);
                }
            }
            System.out.println();
        }
        if (pgm._interactive) {
            // An empty response (Enter alone) clears the flag so the work
            // below runs; any other response leaves it set and skips it.
            if (MDfromHTMLUtils
                .prompt("Press q to quit or press Enter to continue...")
                .length() == 0) {
                pgm._interactive = false;
            }
        }
        if (!pgm._interactive) {
            try {
                List<Path> files = MDfromHTMLUtils.listSourceFiles(
                    FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                    pgm._ext);
                for (Path file : files) {
                    exitVal = pgm.doWork(file);
                    if (exitVal != 0) {
                        break;
                    }
                }
            } catch (Exception e) {
                System.out
                    .println("Error: Can not reference files with extension "
                        + pgm._ext + " in directory " + pgm._inputPath
                        + " reason: " + e.getLocalizedMessage());
                exitVal = -1;
            }
        }
        if (pgm._thumbsucker) {
            System.out.println();
        }
    } else {
        exitVal = -1;
    }
    if (pgm._thumbsucker) {
        System.out.println("Goodbye");
    }
    System.exit(exitVal);
}
/**
 * Splits the specified text file into individual JSON documents and passes
 * each one to {@link #saveFile(String)}.
 * <p>
 * Lines are accumulated until one starting with "}" is read; the
 * accumulated text (including that line) is then saved and the buffer is
 * reset. Any text left in the buffer at end of file is saved as well.
 * <p>
 * If an {@link OutOfMemoryError} occurs while accumulating or saving, the
 * current buffer is discarded and input is skipped until the next line
 * starting with "{", from which accumulation resumes.
 *
 * @param file
 *           the text file containing the sequence of JSON objects
 * @return exit value (0 indicates success, otherwise -1 for failure)
 */
int doWork(Path file) {
    int exitVal = 0;
    BufferedReader br = null;
    try {
        String fqFileName = file.toString();
        if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
        }
        br = MDfromHTMLUtils.openTextFile(fqFileName);
        StringBuilder sb = new StringBuilder();
        String line = br.readLine();
        int linenum = 0;
        while (line != null) {
            linenum++;
            try {
                sb.append(line);
                sb.append("\n");
                if (line.startsWith("}")) {
                    // a closing brace in column one ends the current object
                    saveFile(sb.toString());
                    sb = new StringBuilder();
                }
                line = br.readLine();
            } catch (OutOfMemoryError oome) {
                // drop the oversized object to release its memory
                sb = new StringBuilder();
                System.out.println("Error reading line " + linenum);
                // read to next line starting with {
                line = br.readLine();
                linenum++;
                while (line != null) {
                    if (line.startsWith("{")) {
                        sb.append(line);
                        sb.append("\n");
                        line = br.readLine();
                        linenum++;
                        break;
                    }
                    line = br.readLine();
                    // readLine returns null at end of file, so guard
                    // before inspecting the line
                    if (line != null && line.contains("\"url\":")) {
                        System.out.println("Skipping: " + line);
                    }
                    linenum++;
                }
                System.out.println("Resuming at line " + linenum);
            }
        }
        if (sb.length() > 0) {
            try {
                saveFile(sb.toString());
            } catch (Exception e) {
                System.out.println("\n\nError: " + e.getLocalizedMessage() + "\n");
                System.out.println(sb.toString());
                System.out.println("\n\nEnd Error: " + e.getLocalizedMessage() + "\n");
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        exitVal = -1;
    } finally {
        // close the reader on every path, not only after a clean read
        if (br != null) {
            try {
                MDfromHTMLUtils.closeTextFile(br);
            } catch (Exception e) {
                e.printStackTrace();
                exitVal = -1;
            }
        }
    }
    return exitVal;
}
/**
 * Parses the supplied text as JSON and writes the resulting object to the
 * next numbered ".json" file in the output directory.
 * <p>
 * The file counter advances on every call, including calls whose content
 * can not be parsed. Content that does not parse to a JSON object is
 * reported on standard output and nothing is written. When
 * {@code filterContent} returns true for the object, it is written with an
 * additional ".rejected" suffix instead.
 *
 * @param jsonContent
 *           JSON String to be saved as a file.
 */
void saveFile(String jsonContent) {
    // Build the name first so the counter is consumed even on parse failure.
    String targetName = _outputPath + _filePrefix
        + MDfromHTMLUtils.padLeft(_fileCounter++, 4, '0') + ".json";

    JSONObject json;
    try {
        Object parsed = JSON.parse(jsonContent);
        if (!(parsed instanceof JSONObject)) {
            System.out.println("Error: got a non-JSONObject from parse: " + parsed);
            return;
        }
        json = (JSONObject) parsed;
    } catch (IOException e) {
        System.out.println("Error: Can not transform to JSON: "
            + e.getLocalizedMessage() + "\n" + jsonContent);
        return;
    } catch (ClassCastException cce) {
        System.out.println("Error: Can not parse to JSON: "
            + cce.getLocalizedMessage() + "\n" + jsonContent);
        return;
    }

    try {
        // filterContent runs before the save; true marks the object rejected
        boolean rejected = filterContent(json);
        String destination = rejected ? targetName + ".rejected" : targetName;
        MDfromHTMLUtils.saveJSONFile(destination, json);
    } catch (Exception e) {
        System.out.println("Can not save file " + targetName + " Error: "
            + e.getLocalizedMessage());
    }
}
/**
* Checks the html for filter strings and returns false if none are found. If
* no html nor captureArray then returns true. If the capture array contains
* an object that should be filtered, it is removed from the array.
*
* @param jsonObj
* JSON object to be checked for filters.
* @return
*/
boolean filterContent(JSONObject jsonObj) {
boolean result = true;
if (jsonObj == null) {
return result;
}
JSONArray rejectedURLs = new JSONArray();
JSONArray captureArray = (JSONArray) jsonObj.get("captureArray");
if (captureArray != null) {
JSONObject htmlObj = new JSONObject();
for (Iterator
© 2015 - 2025 Weber Informatics LLC | Privacy Policy