com.mdfromhtml.markdown.transform.ExtractHTMLJSON Maven / Gradle / Ivy
/**
* (c) Copyright 2019-2020 IBM Corporation
* 1 New Orchard Road,
* Armonk, New York, 10504-1722
* United States
* +1 914 499 1900
* support: Nathaniel Mills [email protected]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.mdfromhtml.markdown.transform;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.api.json.JSON;
import com.api.json.JSONArray;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;
/**
* Read the text file produced by issuing curl commands to the html_extractor
* to create individual files for use in the ExtractHTMLJSON utility.
*/
public class ExtractHTMLJSON {
/**
* Creates an instance. The constructor body is empty, so it performs no
* initialization of its own.
*/
public ExtractHTMLJSON() {
}
/**
 * Main entry point to read a specified input directory to find text files
 * containing sequences of JSON Objects to be extracted and written to
 * separate files in the output directory.
 * <p>
 * The method always ends by calling {@link System#exit(int)}. The exit value
 * is 0 on success and -1 when getParams returns false, when the input files
 * can not be listed, or when doWork reports a failure (processing stops at
 * the first file that fails).
 *
 * @param args
 *           inputPath, and outputPath (if not supplied, the program prompts
 *           for their values)
 */
public static void main(String[] args) {
    int exitVal = 0;
    ExtractHTMLJSON pgm = new ExtractHTMLJSON();
    if (pgm.getParams(args)) {
        if (pgm._thumbsucker) {
            // Echo the effective configuration before doing any work.
            System.out.println("\nFiles ending with ." + pgm._ext
                + " will be read from " + pgm._inputPath //
                + "\nand the generated htmljson files (.json) " + "saved in "
                + pgm._outputPath); //
            System.out
                .println("\nFilter strings used to check html for bad pages:");
            for (String filter : pgm._filters) {
                System.out.println(filter);
            }
            System.out.println();
        }
        if (pgm._interactive) {
            // An empty reply (just Enter) means "continue": clearing the flag
            // lets the processing branch below run. Any other reply leaves
            // the flag set, so processing is skipped.
            if (MDfromHTMLUtils
                .prompt("Press q to quit or press Enter to continue...")
                .length() == 0) {
                pgm._interactive = false;
            }
        }
        if (!pgm._interactive) {
            try {
                // Typed list: a raw List can not be iterated as Path elements.
                List<Path> files = MDfromHTMLUtils.listSourceFiles(
                    FileSystems.getDefault().getPath(pgm._inputPath.toString()),
                    pgm._ext);
                for (Path file : files) {
                    exitVal = pgm.doWork(file);
                    if (exitVal != 0) {
                        // Stop at the first file that fails.
                        break;
                    }
                }
            } catch (Exception e) {
                System.out
                    .println("Error: Can not reference files with extension "
                        + pgm._ext + " in directory " + pgm._inputPath
                        + " reason: " + e.getLocalizedMessage());
                exitVal = -1;
            }
        }
        if (pgm._thumbsucker) {
            System.out.println();
        }
    } else {
        exitVal = -1;
    }
    if (pgm._thumbsucker) {
        System.out.println("Goodbye");
    }
    System.exit(exitVal);
}
/**
 * Splits the specified text file into individual JSON objects and passes
 * each one to saveFile.
 * <p>
 * Lines are accumulated until a line starting with "}" is read; the
 * accumulated text (including that line) is then handed to saveFile and the
 * buffer is reset. Text left in the buffer after the last line is handed to
 * saveFile as well.
 * <p>
 * If an OutOfMemoryError occurs while an object is being accumulated or
 * saved, the partial object is discarded and input is skipped up to the next
 * line starting with "{", where processing resumes. Skipped lines containing
 * "url": are echoed to standard output.
 *
 * @param file
 *           the text file containing the sequence of JSON objects
 * @return exit value (0 indicates success, otherwise -1 for failure)
 */
int doWork(Path file) {
    int exitVal = 0;
    try {
        String fqFileName = file.toString();
        if (_thumbsucker) {
            System.out.println("Processing: " + fqFileName);
        }
        BufferedReader br = MDfromHTMLUtils.openTextFile(fqFileName);
        try {
            // Local, single-threaded buffer: no need for StringBuffer locking.
            StringBuilder sb = new StringBuilder();
            String line = br.readLine();
            int linenum = 0;
            while (line != null) {
                linenum++;
                try {
                    sb.append(line).append("\n");
                    if (line.startsWith("}")) {
                        // A line starting with "}" closes the current object.
                        saveFile(sb.toString());
                        sb = new StringBuilder();
                    }
                    line = br.readLine();
                } catch (OutOfMemoryError oome) {
                    // Drop the partial object so its memory can be reclaimed.
                    sb = new StringBuilder();
                    System.out.println("Error reading line "+linenum);
                    // Skip forward to the next line starting with "{". That
                    // line is left in 'line' so the main loop counts and
                    // appends it exactly once. readLine() returns null at end
                    // of input, which also ends the skip.
                    line = br.readLine();
                    while (line != null && !line.startsWith("{")) {
                        linenum++;
                        if (line.contains("\"url\":")) {
                            System.out.println("Skipping: "+line);
                        }
                        line = br.readLine();
                    }
                    System.out.println("Resuming at line "+(linenum + 1));
                }
            }
            if (sb.length() > 0) {
                // Hand over whatever followed the last closing brace.
                try {
                    saveFile(sb.toString());
                } catch (Exception e) {
                    System.out.println("\n\nError: "+e.getLocalizedMessage()+"\n");
                    System.out.println(sb.toString());
                    System.out.println("\n\nEnd Error: "+e.getLocalizedMessage()+"\n");
                }
            }
        } finally {
            // Close the reader on failure paths too, not only on success.
            MDfromHTMLUtils.closeTextFile(br);
        }
    } catch (Exception e) {
        e.printStackTrace();
        exitVal = -1;
    }
    return exitVal;
}
/**
 * Parses the supplied text as JSON and writes the resulting object to the
 * next numbered ".json" file in the output directory.
 * <p>
 * The file counter advances on every call, including calls whose content
 * fails to parse. When filterContent returns true for the parsed object, the
 * file is written with an additional ".rejected" suffix. An IOException or
 * ClassCastException while parsing, a parse result that is not a JSONObject,
 * and any Exception while filtering or saving are reported on standard
 * output and not rethrown.
 *
 * @param jsonContent
 *           JSON String to be saved as a file.
 */
void saveFile(String jsonContent) {
    // Build the name first: the counter is consumed even if parsing fails.
    String outputFileName = _outputPath + _filePrefix
        + MDfromHTMLUtils.padLeft(_fileCounter++, 4, '0') + ".json";

    JSONObject obj;
    try {
        Object parsed = JSON.parse(jsonContent);
        if (!(parsed instanceof JSONObject)) {
            System.out.println("Error: got a non-JSONObject from parse: "+parsed);
            return;
        }
        obj = (JSONObject) parsed;
    } catch (IOException e) {
        System.out.println("Error: Can not transform to JSON: "
            + e.getLocalizedMessage() + "\n" + jsonContent);
        return;
    } catch (ClassCastException cce) {
        System.out.println("Error: Can not parse to JSON: "
            + cce.getLocalizedMessage() + "\n" + jsonContent);
        return;
    }

    try {
        // Filtered objects are still written, under a ".rejected" name.
        boolean rejected = filterContent(obj);
        String target = rejected ? outputFileName + ".rejected" : outputFileName;
        MDfromHTMLUtils.saveJSONFile(target, obj);
    } catch (Exception e) {
        System.out.println("Can not save file " + outputFileName + " Error: "
            + e.getLocalizedMessage());
    }
}
/**
* Checks the html for filter strings and returns false if none are found. If
* no html nor captureArray then returns true. If the capture array contains
* an object that should be filtered, it is removed from the array.
*
* @param jsonObj
* JSON object to be checked for filters.
* @return
*/
boolean filterContent(JSONObject jsonObj) {
boolean result = true;
if (jsonObj == null) {
return result;
}
JSONArray rejectedURLs = new JSONArray();
JSONArray captureArray = (JSONArray) jsonObj.get("captureArray");
if (captureArray != null) {
JSONObject htmlObj = new JSONObject();
for (Iterator